diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,164948 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "eval_steps": 500, + "global_step": 29980, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "loss": 22.082683563232422, + "step": 0 + }, + { + "ce_loss": 4.019954204559326, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.1744863986968994, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 3.622408151626587, + "step": 0 + }, + { + "epoch": 0, + "loss": 21.855388641357422, + "step": 0 + }, + { + "ce_loss": 3.952629566192627, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.1908594369888306, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 3.536539077758789, + "step": 0 + }, + { + "epoch": 0.00333555703802535, + "loss": 18.8512, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "grad_norm": 346.46929931640625, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "learning_rate": 8.88888888888889e-06, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 11.710114479064941, + "step": 10 + }, + { + "ce_loss": 3.7606523036956787, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.2074427604675293, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 3.6196365356445312, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 10.445865631103516, + "step": 10 + }, + { + "ce_loss": 3.6248669624328613, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.240032434463501, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 3.5363941192626953, + "step": 10 + }, + { + "epoch": 0.0066711140760507, + "loss": 8.6757, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "grad_norm": 35.1971435546875, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "learning_rate": 1.777777777777778e-05, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 5.543203830718994, + "step": 20 + }, + { + "ce_loss": 1.736590027809143, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.0234243869781494, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 2.7691500186920166, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 5.789950847625732, + "step": 20 + }, + { + "ce_loss": 1.7167848348617554, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.1363052129745483, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 2.6627798080444336, + "step": 20 + }, + { + "epoch": 0.01000667111407605, + "loss": 4.629, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "grad_norm": 20.973167419433594, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "learning_rate": 2.6666666666666667e-05, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 3.992175579071045, + "step": 30 + }, + { + "ce_loss": 0.5917344689369202, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 0.7147521376609802, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 1.385014295578003, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 3.1635243892669678, + "step": 30 + }, + { + "ce_loss": 0.6208186745643616, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 0.8098734021186829, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 1.298578143119812, + "step": 30 + }, + { + "epoch": 0.0133422281521014, + "loss": 2.6165, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "grad_norm": 3.0407230854034424, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "learning_rate": 3.555555555555556e-05, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 2.3902668952941895, + "step": 40 + }, + { + "ce_loss": 0.6940972208976746, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 0.5506912469863892, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 0.6016234159469604, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 1.8760014772415161, + "step": 40 + }, + { + "ce_loss": 0.6062495112419128, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 0.5111391544342041, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 0.5445473194122314, + "step": 40 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.0494, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "grad_norm": 2.9749488830566406, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "learning_rate": 4.4444444444444447e-05, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 1.7558726072311401, + "step": 50 + }, + { + "ce_loss": 0.5625492334365845, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 0.5216071605682373, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.4841041564941406, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.287813663482666, + "step": 50 + }, + { + "ce_loss": 0.5892652869224548, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 0.6219286322593689, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.3849905729293823, + "step": 50 + }, + { + "epoch": 0.0200133422281521, + "loss": 1.9326, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "grad_norm": 2.8535168170928955, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "learning_rate": 5.333333333333333e-05, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 1.7353334426879883, + "step": 60 + }, + { + "ce_loss": 0.6446412801742554, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.4365006685256958, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.39507171511650085, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 1.2849175930023193, + "step": 60 + }, + { + "ce_loss": 0.5129432082176208, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.4272335469722748, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.3446718752384186, + "step": 60 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.7874, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "grad_norm": 3.4548161029815674, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "learning_rate": 6.222222222222222e-05, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.2661957740783691, + "step": 70 + }, + { + "ce_loss": 0.5155786275863647, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.44635701179504395, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.3041374385356903, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.6170333623886108, + "step": 70 + }, + { + "ce_loss": 0.5844157338142395, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.48108649253845215, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.35459014773368835, + "step": 70 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.7319, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "grad_norm": 3.476360321044922, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "learning_rate": 7.111111111111112e-05, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.6175156831741333, + "step": 80 + }, + { + "ce_loss": 0.6071637272834778, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.46768084168434143, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.36896222829818726, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.8820009231567383, + "step": 80 + }, + { + "ce_loss": 0.6206005215644836, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.48498210310935974, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.35933607816696167, + "step": 80 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.8923, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "grad_norm": 3.8720743656158447, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "learning_rate": 8e-05, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.8874759674072266, + "step": 90 + }, + { + "ce_loss": 0.5756715536117554, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.506288468837738, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.3379663825035095, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.5637786388397217, + "step": 90 + }, + { + "ce_loss": 0.4854426383972168, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.5292065739631653, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.31842878460884094, + "step": 90 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.863, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "grad_norm": 4.6447319984436035, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "learning_rate": 8.888888888888889e-05, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.3307164907455444, + "step": 100 + }, + { + "ce_loss": 0.5048717856407166, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.4587930738925934, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.3670507073402405, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.9730502367019653, + "step": 100 + }, + { + "ce_loss": 0.6307158470153809, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.5659656524658203, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.3052903413772583, + "step": 100 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.7245, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "grad_norm": 2.048370122909546, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "learning_rate": 9.777777777777778e-05, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.917222499847412, + "step": 110 + }, + { + "ce_loss": 0.5229926705360413, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.4720943868160248, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.28838008642196655, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.9603099822998047, + "step": 110 + }, + { + "ce_loss": 0.5854602456092834, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.5150183439254761, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.34388861060142517, + "step": 110 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.8773, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "grad_norm": 5.81011962890625, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "learning_rate": 0.00010666666666666667, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.600690245628357, + "step": 120 + }, + { + "ce_loss": 0.5576359629631042, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.49549993872642517, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.347184419631958, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 2.348037004470825, + "step": 120 + }, + { + "ce_loss": 0.5738920569419861, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.4258148670196533, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.35311853885650635, + "step": 120 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.775, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "grad_norm": 5.108306407928467, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "learning_rate": 0.00011555555555555555, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.927753210067749, + "step": 130 + }, + { + "ce_loss": 0.5603481531143188, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.5085574388504028, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.24505111575126648, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.2074342966079712, + "step": 130 + }, + { + "ce_loss": 0.4739816188812256, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.40206992626190186, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.3313703238964081, + "step": 130 + }, + { + "epoch": 0.0466977985323549, + "loss": 1.8075, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "grad_norm": 5.600295066833496, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "learning_rate": 0.00012444444444444444, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 3.315217971801758, + "step": 140 + }, + { + "ce_loss": 0.5632176995277405, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.46269339323043823, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.3052278161048889, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 2.9743146896362305, + "step": 140 + }, + { + "ce_loss": 0.6905938386917114, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.49012720584869385, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.32296621799468994, + "step": 140 + }, + { + "epoch": 0.05003335557038025, + "loss": 1.9235, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "grad_norm": 2.0878727436065674, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "learning_rate": 0.00013333333333333334, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 2.5374321937561035, + "step": 150 + }, + { + "ce_loss": 0.5370458364486694, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.47549453377723694, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.26440954208374023, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 1.6627169847488403, + "step": 150 + }, + { + "ce_loss": 0.5243386626243591, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.47431784868240356, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.3353169560432434, + "step": 150 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.7777, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "grad_norm": 1.928293228149414, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "learning_rate": 0.00014222222222222224, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.6847267150878906, + "step": 160 + }, + { + "ce_loss": 0.5684657692909241, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.5586349368095398, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.268443763256073, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.8770315647125244, + "step": 160 + }, + { + "ce_loss": 0.470410019159317, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.47870075702667236, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.27711212635040283, + "step": 160 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.8477, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "grad_norm": 3.7592098712921143, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "learning_rate": 0.0001511111111111111, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.6315332651138306, + "step": 170 + }, + { + "ce_loss": 0.5627481937408447, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.5033910274505615, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.3061639964580536, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.5741348266601562, + "step": 170 + }, + { + "ce_loss": 0.4609927833080292, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.468230277299881, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.24796752631664276, + "step": 170 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.7242, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "grad_norm": 1.7700247764587402, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "learning_rate": 0.00016, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.34260094165802, + "step": 180 + }, + { + "ce_loss": 0.5201642513275146, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.522593080997467, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.29970839619636536, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.6730177402496338, + "step": 180 + }, + { + "ce_loss": 0.5037712454795837, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.5402393937110901, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.3126290440559387, + "step": 180 + }, + { + "epoch": 0.06337558372248166, + "loss": 1.6673, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "grad_norm": 1.8502429723739624, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "learning_rate": 0.00016888888888888889, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 1.8196762800216675, + "step": 190 + }, + { + "ce_loss": 0.5710945725440979, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.5098980069160461, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.2702150046825409, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 1.9046032428741455, + "step": 190 + }, + { + "ce_loss": 0.6237345337867737, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.5800604224205017, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.2864050567150116, + "step": 190 + }, + { + "epoch": 0.066711140760507, + "loss": 1.7973, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "grad_norm": 1.8052473068237305, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "learning_rate": 0.00017777777777777779, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 2.0465126037597656, + "step": 200 + }, + { + "ce_loss": 0.571826696395874, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.5833513140678406, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.27391302585601807, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 2.0846211910247803, + "step": 200 + }, + { + "ce_loss": 0.6135091185569763, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.5756477117538452, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.30294105410575867, + "step": 200 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.7797, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "grad_norm": 3.2647974491119385, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "learning_rate": 0.0001866666666666667, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.6143054962158203, + "step": 210 + }, + { + "ce_loss": 0.48211437463760376, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.5302156209945679, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.24415576457977295, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.7094444036483765, + "step": 210 + }, + { + "ce_loss": 0.5260207056999207, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.4881989359855652, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.3114811182022095, + "step": 210 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.7491, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "grad_norm": 3.6222076416015625, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "learning_rate": 0.00019555555555555556, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.7036691904067993, + "step": 220 + }, + { + "ce_loss": 0.5687296390533447, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.6326914429664612, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.295692503452301, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.9834920167922974, + "step": 220 + }, + { + "ce_loss": 0.5344058871269226, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.5521599054336548, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.29361093044281006, + "step": 220 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.7304, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "grad_norm": 21.085020065307617, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "learning_rate": 0.00020444444444444443, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.725504755973816, + "step": 230 + }, + { + "ce_loss": 0.6624048352241516, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.6723373532295227, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.39068496227264404, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 2.677776336669922, + "step": 230 + }, + { + "ce_loss": 0.7152729034423828, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.7655081748962402, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.3652455508708954, + "step": 230 + }, + { + "epoch": 0.0800533689126084, + "loss": 2.1082, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "grad_norm": 5.6974921226501465, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "learning_rate": 0.00021333333333333333, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.5531107187271118, + "step": 240 + }, + { + "ce_loss": 0.5192185044288635, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.5373853445053101, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.2714692950248718, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.5598499774932861, + "step": 240 + }, + { + "ce_loss": 0.5087171196937561, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.5728287696838379, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.2059345841407776, + "step": 240 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.6852, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "grad_norm": 1.2824687957763672, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "learning_rate": 0.00022222222222222223, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 2.172323226928711, + "step": 250 + }, + { + "ce_loss": 0.6075947880744934, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.5816372632980347, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.2618728280067444, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.9809627532958984, + "step": 250 + }, + { + "ce_loss": 0.4714495539665222, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.48982515931129456, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.2820373475551605, + "step": 250 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.8546, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "grad_norm": 2.8757669925689697, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "learning_rate": 0.0002311111111111111, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 2.353008270263672, + "step": 260 + }, + { + "ce_loss": 0.5781505107879639, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.5666728615760803, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.2690242826938629, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.535614013671875, + "step": 260 + }, + { + "ce_loss": 0.47560837864875793, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.5879183411598206, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.28443270921707153, + "step": 260 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.7228, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "grad_norm": 3.3693370819091797, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "learning_rate": 0.00024, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.65613853931427, + "step": 270 + }, + { + "ce_loss": 0.509759783744812, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.6900843381881714, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.21354441344738007, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.7907038927078247, + "step": 270 + }, + { + "ce_loss": 0.47750452160835266, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.5887446403503418, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.25874146819114685, + "step": 270 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.5949, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "grad_norm": 1.3736844062805176, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "learning_rate": 0.0002488888888888889, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.6113965511322021, + "step": 280 + }, + { + "ce_loss": 0.569277822971344, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.6352401971817017, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.23418693244457245, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.9638503789901733, + "step": 280 + }, + { + "ce_loss": 0.5025474429130554, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.6397897601127625, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.24622640013694763, + "step": 280 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.6697, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "grad_norm": 1.6431907415390015, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "learning_rate": 0.00025777777777777783, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 2.6338305473327637, + "step": 290 + }, + { + "ce_loss": 0.5702818036079407, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.5806958079338074, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.21621885895729065, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.3259646892547607, + "step": 290 + }, + { + "ce_loss": 0.42298078536987305, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.5434897541999817, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.24230249226093292, + "step": 290 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.7942, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "grad_norm": 1.4105414152145386, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "learning_rate": 0.0002666666666666667, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.7872378826141357, + "step": 300 + }, + { + "ce_loss": 0.49962472915649414, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.5508493781089783, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.2687307596206665, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.2783927917480469, + "step": 300 + }, + { + "ce_loss": 0.4945143759250641, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.5173062086105347, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.2663983404636383, + "step": 300 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.802, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "grad_norm": 2.590189218521118, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "learning_rate": 0.0002755555555555556, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 2.033294916152954, + "step": 310 + }, + { + "ce_loss": 0.48483648896217346, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.5946583151817322, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.24107787013053894, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.7032603025436401, + "step": 310 + }, + { + "ce_loss": 0.4977302849292755, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.5834342241287231, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.2511594593524933, + "step": 310 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.8593, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "grad_norm": 1.435152292251587, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "learning_rate": 0.0002844444444444445, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.522861123085022, + "step": 320 + }, + { + "ce_loss": 0.5201138854026794, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.5344372391700745, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.27378469705581665, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.5926438570022583, + "step": 320 + }, + { + "ce_loss": 0.4458596408367157, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.48734861612319946, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.32077357172966003, + "step": 320 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.5826, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "grad_norm": 1.7621814012527466, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "learning_rate": 0.0002933333333333333, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.753835916519165, + "step": 330 + }, + { + "ce_loss": 0.5066315531730652, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.4909520149230957, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.28438347578048706, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.2814210653305054, + "step": 330 + }, + { + "ce_loss": 0.4937293231487274, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.593289852142334, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.19415561854839325, + "step": 330 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.7061, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "grad_norm": 2.5554006099700928, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "learning_rate": 0.0003022222222222222, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.844890832901001, + "step": 340 + }, + { + "ce_loss": 0.6287904381752014, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.694754958152771, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.2634267807006836, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.6410373449325562, + "step": 340 + }, + { + "ce_loss": 0.5369527339935303, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.6398927569389343, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.27201586961746216, + "step": 340 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.5979, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "grad_norm": 1.9453589916229248, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "learning_rate": 0.0003111111111111111, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.646355390548706, + "step": 350 + }, + { + "ce_loss": 0.5751283168792725, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.6439217329025269, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.24644474685192108, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.6202154159545898, + "step": 350 + }, + { + "ce_loss": 0.5422464609146118, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.6799194812774658, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.24791789054870605, + "step": 350 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.6728, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "grad_norm": 1.7183290719985962, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "learning_rate": 0.00032, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.585992693901062, + "step": 360 + }, + { + "ce_loss": 0.5894421935081482, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.5706028342247009, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.27797845005989075, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.7014429569244385, + "step": 360 + }, + { + "ce_loss": 0.4603886604309082, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.6570786833763123, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.22744178771972656, + "step": 360 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.5367, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "grad_norm": 1.3087623119354248, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "learning_rate": 0.00032888888888888887, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 2.3369619846343994, + "step": 370 + }, + { + "ce_loss": 0.6497780084609985, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.646597146987915, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.3300057351589203, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.4946203231811523, + "step": 370 + }, + { + "ce_loss": 0.4301724433898926, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.6341826319694519, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.24294087290763855, + "step": 370 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.6486, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "grad_norm": 1.720984697341919, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "learning_rate": 0.00033777777777777777, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.5315797328948975, + "step": 380 + }, + { + "ce_loss": 0.4742905795574188, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.5379242300987244, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.2992285490036011, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.4294133186340332, + "step": 380 + }, + { + "ce_loss": 0.543543815612793, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.5854701399803162, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.29929932951927185, + "step": 380 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.497, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "grad_norm": 1.4799915552139282, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "learning_rate": 0.00034666666666666667, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.6086652278900146, + "step": 390 + }, + { + "ce_loss": 0.4943905174732208, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.6120883226394653, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.2651229500770569, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.7222286462783813, + "step": 390 + }, + { + "ce_loss": 0.4503355622291565, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.521122932434082, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.2309369146823883, + "step": 390 + }, + { + "epoch": 0.133422281521014, + "loss": 1.6039, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "grad_norm": 2.7860965728759766, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "learning_rate": 0.00035555555555555557, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 2.0828051567077637, + "step": 400 + }, + { + "ce_loss": 0.5030226111412048, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.5764296650886536, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.2733519673347473, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 1.5071513652801514, + "step": 400 + }, + { + "ce_loss": 0.48920971155166626, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.5897278785705566, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.2842494249343872, + "step": 400 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.8246, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "grad_norm": 4.629027366638184, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "learning_rate": 0.00036444444444444447, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.933959722518921, + "step": 410 + }, + { + "ce_loss": 0.4567517638206482, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.6056955456733704, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.23806793987751007, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.6946505308151245, + "step": 410 + }, + { + "ce_loss": 0.5993149280548096, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.7280476093292236, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.22402344644069672, + "step": 410 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.6325, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "grad_norm": 3.0605082511901855, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "learning_rate": 0.0003733333333333334, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.3284132480621338, + "step": 420 + }, + { + "ce_loss": 0.495491623878479, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.5443464517593384, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.2884868383407593, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.2195791006088257, + "step": 420 + }, + { + "ce_loss": 0.44059133529663086, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.4888474941253662, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.2899805009365082, + "step": 420 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.6223, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "grad_norm": 3.7689526081085205, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "learning_rate": 0.0003822222222222223, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.595637559890747, + "step": 430 + }, + { + "ce_loss": 0.544774055480957, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.515714168548584, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.24679963290691376, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.3475347757339478, + "step": 430 + }, + { + "ce_loss": 0.44207265973091125, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.4540649354457855, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.23646868765354156, + "step": 430 + }, + { + "epoch": 0.1467645096731154, + "loss": 1.644, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "grad_norm": 1.7760794162750244, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "learning_rate": 0.0003911111111111111, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 1.3347100019454956, + "step": 440 + }, + { + "ce_loss": 0.4692061245441437, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.6326097846031189, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.23280993103981018, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 1.2680646181106567, + "step": 440 + }, + { + "ce_loss": 0.44068577885627747, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.5154201984405518, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.3117990493774414, + "step": 440 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.593, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "grad_norm": 2.2271616458892822, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "learning_rate": 0.0004, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.6983128786087036, + "step": 450 + }, + { + "ce_loss": 0.4268217980861664, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.4826546013355255, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.26377132534980774, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 2.1044223308563232, + "step": 450 + }, + { + "ce_loss": 0.4926432967185974, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.5348109006881714, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.2828768491744995, + "step": 450 + }, + { + "epoch": 0.1534356237491661, + "loss": 1.6717, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "grad_norm": 1.3050262928009033, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "learning_rate": 0.00040888888888888887, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 1.4850102663040161, + "step": 460 + }, + { + "ce_loss": 0.3940463066101074, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.4585028290748596, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.19937008619308472, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 1.4205468893051147, + "step": 460 + }, + { + "ce_loss": 0.47472071647644043, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.5782334804534912, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.21085552871227264, + "step": 460 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.5714, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "grad_norm": 1.5946571826934814, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "learning_rate": 0.0004177777777777778, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.7707476615905762, + "step": 470 + }, + { + "ce_loss": 0.5047350525856018, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.6464635729789734, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.28190878033638, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.366986632347107, + "step": 470 + }, + { + "ce_loss": 0.45065632462501526, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.5560937523841858, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.23909518122673035, + "step": 470 + }, + { + "epoch": 0.1601067378252168, + "loss": 1.6082, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "grad_norm": 1.7290911674499512, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "learning_rate": 0.00042666666666666667, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 1.3324958086013794, + "step": 480 + }, + { + "ce_loss": 0.378185898065567, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.5746075510978699, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.24356596171855927, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 1.4968345165252686, + "step": 480 + }, + { + "ce_loss": 0.4363265931606293, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.5504066348075867, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.19955141842365265, + "step": 480 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.5729, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "grad_norm": 1.1001335382461548, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "learning_rate": 0.00043555555555555557, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.4524122476577759, + "step": 490 + }, + { + "ce_loss": 0.5018464922904968, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.5495221018791199, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.2558499574661255, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.4316108226776123, + "step": 490 + }, + { + "ce_loss": 0.45365622639656067, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.5579233765602112, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.2731827199459076, + "step": 490 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.4884, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "grad_norm": 1.5505117177963257, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "learning_rate": 0.00044444444444444447, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.3067128658294678, + "step": 500 + }, + { + "ce_loss": 0.433439165353775, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.4941195547580719, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.2596491873264313, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 2.249636650085449, + "step": 500 + }, + { + "ce_loss": 0.49755096435546875, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.49061688780784607, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.27567821741104126, + "step": 500 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.6774, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "grad_norm": 1.3292597532272339, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "learning_rate": 0.00045333333333333337, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.595599889755249, + "step": 510 + }, + { + "ce_loss": 0.5413509607315063, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.6084362268447876, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.295004278421402, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.9175055027008057, + "step": 510 + }, + { + "ce_loss": 0.4983865022659302, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.5698104500770569, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.3239087760448456, + "step": 510 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.5642, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "grad_norm": 1.2634004354476929, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "learning_rate": 0.0004622222222222222, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.9376397132873535, + "step": 520 + }, + { + "ce_loss": 0.5932291746139526, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.46436649560928345, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.2764166295528412, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 2.0309195518493652, + "step": 520 + }, + { + "ce_loss": 0.5371260643005371, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.517487645149231, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.19353549182415009, + "step": 520 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.653, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "grad_norm": 2.620346784591675, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "learning_rate": 0.00047111111111111117, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.550959825515747, + "step": 530 + }, + { + "ce_loss": 0.47280043363571167, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.543157696723938, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.21153424680233002, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.3030387163162231, + "step": 530 + }, + { + "ce_loss": 0.5019891262054443, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.5249347686767578, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.2558169960975647, + "step": 530 + }, + { + "epoch": 0.1801200800533689, + "loss": 1.6, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "grad_norm": 1.2312290668487549, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "learning_rate": 0.00048, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 1.629996418952942, + "step": 540 + }, + { + "ce_loss": 0.47734594345092773, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.5398337841033936, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.23363655805587769, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 1.3013463020324707, + "step": 540 + }, + { + "ce_loss": 0.5037491321563721, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.5337326526641846, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.25985610485076904, + "step": 540 + }, + { + "epoch": 0.18345563709139426, + "loss": 1.6104, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "grad_norm": 3.5466859340667725, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "learning_rate": 0.000488888888888889, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 1.25911283493042, + "step": 550 + }, + { + "ce_loss": 0.4432990550994873, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.5283195972442627, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.28703317046165466, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 1.4867393970489502, + "step": 550 + }, + { + "ce_loss": 0.46072185039520264, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.6002609729766846, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.3060095012187958, + "step": 550 + }, + { + "epoch": 0.1867911941294196, + "loss": 1.5295, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "grad_norm": 2.128678798675537, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "learning_rate": 0.0004977777777777778, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 1.3652108907699585, + "step": 560 + }, + { + "ce_loss": 0.41976261138916016, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.5678168535232544, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.2214897722005844, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 1.4672818183898926, + "step": 560 + }, + { + "ce_loss": 0.5104706287384033, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.5782572031021118, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.2707638740539551, + "step": 560 + }, + { + "epoch": 0.19012675116744496, + "loss": 1.5453, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "grad_norm": 2.144012451171875, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "learning_rate": 0.0005066666666666667, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 2.1711885929107666, + "step": 570 + }, + { + "ce_loss": 0.4930206835269928, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.5990675091743469, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.2547401189804077, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 1.7919820547103882, + "step": 570 + }, + { + "ce_loss": 0.485125333070755, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.5928157567977905, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.25338858366012573, + "step": 570 + }, + { + "epoch": 0.1934623082054703, + "loss": 1.5982, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "grad_norm": 1.5169459581375122, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "learning_rate": 0.0005155555555555557, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 1.3269596099853516, + "step": 580 + }, + { + "ce_loss": 0.5053694248199463, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.5650742650032043, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.2564832866191864, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 1.6118474006652832, + "step": 580 + }, + { + "ce_loss": 0.4108356535434723, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.6295657157897949, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.2287975251674652, + "step": 580 + }, + { + "epoch": 0.19679786524349566, + "loss": 1.6172, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "grad_norm": 1.4867298603057861, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "learning_rate": 0.0005244444444444445, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 1.4855684041976929, + "step": 590 + }, + { + "ce_loss": 0.4735114574432373, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.6113525032997131, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.2961452901363373, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 1.7030887603759766, + "step": 590 + }, + { + "ce_loss": 0.4876049757003784, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.5494129061698914, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.29730701446533203, + "step": 590 + }, + { + "epoch": 0.200133422281521, + "loss": 1.5749, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "grad_norm": 2.0874152183532715, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "learning_rate": 0.0005333333333333334, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 1.338006615638733, + "step": 600 + }, + { + "ce_loss": 0.4650641083717346, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.4777800440788269, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.2336799055337906, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 1.4985271692276, + "step": 600 + }, + { + "ce_loss": 0.5751738548278809, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.46163544058799744, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.3208712041378021, + "step": 600 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.5238, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "grad_norm": 1.4101930856704712, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "learning_rate": 0.0005422222222222223, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.7546379566192627, + "step": 610 + }, + { + "ce_loss": 0.460560142993927, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.5301985144615173, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.31578245759010315, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.3197903633117676, + "step": 610 + }, + { + "ce_loss": 0.43814700841903687, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.6068532466888428, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.25644248723983765, + "step": 610 + }, + { + "epoch": 0.2068045363575717, + "loss": 1.7156, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "grad_norm": 2.0110700130462646, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "learning_rate": 0.0005511111111111112, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 1.3195078372955322, + "step": 620 + }, + { + "ce_loss": 0.4470673203468323, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.5740320086479187, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.2269422858953476, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 1.3146237134933472, + "step": 620 + }, + { + "ce_loss": 0.4497494101524353, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.6298726201057434, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.2145850658416748, + "step": 620 + }, + { + "epoch": 0.21014009339559706, + "loss": 1.5681, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "grad_norm": 1.196976900100708, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "learning_rate": 0.00056, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 1.5452654361724854, + "step": 630 + }, + { + "ce_loss": 0.42693886160850525, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.6515905857086182, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.3077460825443268, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 1.7794028520584106, + "step": 630 + }, + { + "ce_loss": 0.4400431513786316, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.6112064123153687, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.19944915175437927, + "step": 630 + }, + { + "epoch": 0.2134756504336224, + "loss": 1.6714, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "grad_norm": 2.2573142051696777, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "learning_rate": 0.000568888888888889, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 1.4484288692474365, + "step": 640 + }, + { + "ce_loss": 0.4527072310447693, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.4269205927848816, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.3586342930793762, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 1.2968647480010986, + "step": 640 + }, + { + "ce_loss": 0.4351823627948761, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.4391202926635742, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.32410529255867004, + "step": 640 + }, + { + "epoch": 0.21681120747164775, + "loss": 1.6453, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "grad_norm": 1.68989098072052, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "learning_rate": 0.0005777777777777778, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 1.443930983543396, + "step": 650 + }, + { + "ce_loss": 0.4861678183078766, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.6249301433563232, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.21519602835178375, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 1.4586472511291504, + "step": 650 + }, + { + "ce_loss": 0.42792659997940063, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.6605918407440186, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.26281821727752686, + "step": 650 + }, + { + "epoch": 0.2201467645096731, + "loss": 1.6017, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "grad_norm": 2.7463796138763428, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "learning_rate": 0.0005866666666666667, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 1.276236653327942, + "step": 660 + }, + { + "ce_loss": 0.45802658796310425, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.5395572185516357, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.27748921513557434, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 2.1881327629089355, + "step": 660 + }, + { + "ce_loss": 0.47671204805374146, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.5150418877601624, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.27915847301483154, + "step": 660 + }, + { + "epoch": 0.22348232154769845, + "loss": 1.5746, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "grad_norm": 1.6654798984527588, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "learning_rate": 0.0005955555555555556, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 1.593944787979126, + "step": 670 + }, + { + "ce_loss": 0.4207690954208374, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.6195393800735474, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.26322174072265625, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 1.866980791091919, + "step": 670 + }, + { + "ce_loss": 0.5242395401000977, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.692948043346405, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.2934406101703644, + "step": 670 + }, + { + "epoch": 0.2268178785857238, + "loss": 1.5249, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "grad_norm": 1.435062289237976, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "learning_rate": 0.0006044444444444445, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 1.4246501922607422, + "step": 680 + }, + { + "ce_loss": 0.4549441635608673, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.5518971681594849, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.2823745608329773, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 2.719393253326416, + "step": 680 + }, + { + "ce_loss": 0.5663154721260071, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.6299650073051453, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.2891238033771515, + "step": 680 + }, + { + "epoch": 0.23015343562374915, + "loss": 1.7289, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "grad_norm": 2.50449538230896, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "learning_rate": 0.0006133333333333334, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 1.3545693159103394, + "step": 690 + }, + { + "ce_loss": 0.5105953812599182, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.5700008869171143, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.2721017599105835, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 1.2746974229812622, + "step": 690 + }, + { + "ce_loss": 0.48519670963287354, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.5226454138755798, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.26627910137176514, + "step": 690 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.6944, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "grad_norm": 1.9347381591796875, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "learning_rate": 0.0006222222222222223, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.4545104503631592, + "step": 700 + }, + { + "ce_loss": 0.4468264579772949, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.4760701656341553, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.2496543526649475, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.7434810400009155, + "step": 700 + }, + { + "ce_loss": 0.5025457739830017, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.5106308460235596, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.32452499866485596, + "step": 700 + }, + { + "epoch": 0.23682454969979988, + "loss": 1.626, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "grad_norm": 1.789971113204956, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "learning_rate": 0.0006311111111111112, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 1.2687652111053467, + "step": 710 + }, + { + "ce_loss": 0.49126219749450684, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.4775649607181549, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.2650153636932373, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 1.09128737449646, + "step": 710 + }, + { + "ce_loss": 0.4264126718044281, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.40314796566963196, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.2587401270866394, + "step": 710 + }, + { + "epoch": 0.24016010673782523, + "loss": 1.6084, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "grad_norm": 2.9466187953948975, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "learning_rate": 0.00064, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 1.9708633422851562, + "step": 720 + }, + { + "ce_loss": 0.45224499702453613, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.4592093229293823, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.2654026448726654, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 1.2625981569290161, + "step": 720 + }, + { + "ce_loss": 0.41097569465637207, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.4138302206993103, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.2421134114265442, + "step": 720 + }, + { + "epoch": 0.24349566377585058, + "loss": 1.528, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "grad_norm": 1.8387153148651123, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "learning_rate": 0.000648888888888889, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 1.3539307117462158, + "step": 730 + }, + { + "ce_loss": 0.4820309579372406, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.6259849667549133, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.22844180464744568, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 1.6995915174484253, + "step": 730 + }, + { + "ce_loss": 0.41940057277679443, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.6038891673088074, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.26597100496292114, + "step": 730 + }, + { + "epoch": 0.24683122081387593, + "loss": 1.5918, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "grad_norm": 1.9388843774795532, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "learning_rate": 0.0006577777777777777, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 1.5610748529434204, + "step": 740 + }, + { + "ce_loss": 0.45109638571739197, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.5143248438835144, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.2895212769508362, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 1.315789818763733, + "step": 740 + }, + { + "ce_loss": 0.40198326110839844, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.5153144598007202, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.2582947015762329, + "step": 740 + }, + { + "epoch": 0.2501667778519013, + "loss": 1.5719, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "grad_norm": 1.69242525100708, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "learning_rate": 0.0006666666666666668, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 2.689157485961914, + "step": 750 + }, + { + "ce_loss": 0.4594300091266632, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.6778824925422668, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.30148231983184814, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 1.4996731281280518, + "step": 750 + }, + { + "ce_loss": 0.45501625537872314, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.6697419285774231, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.2740204632282257, + "step": 750 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.6409, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "grad_norm": 1.9353746175765991, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "learning_rate": 0.0006755555555555555, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.4622337818145752, + "step": 760 + }, + { + "ce_loss": 0.49489298462867737, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.5559422969818115, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.3610493540763855, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.6780763864517212, + "step": 760 + }, + { + "ce_loss": 0.4462796151638031, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.5648516416549683, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.3000343143939972, + "step": 760 + }, + { + "epoch": 0.256837891927952, + "loss": 1.7037, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "grad_norm": 2.1047372817993164, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "learning_rate": 0.0006844444444444444, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 1.5280171632766724, + "step": 770 + }, + { + "ce_loss": 0.42800432443618774, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.5842844843864441, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.2647983133792877, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 1.9386160373687744, + "step": 770 + }, + { + "ce_loss": 0.4129220247268677, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.5319868326187134, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.3261532485485077, + "step": 770 + }, + { + "epoch": 0.2601734489659773, + "loss": 1.5582, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "grad_norm": 1.3958163261413574, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "learning_rate": 0.0006933333333333333, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 1.4289088249206543, + "step": 780 + }, + { + "ce_loss": 0.5142719149589539, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.5124003291130066, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.27058809995651245, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 1.4231754541397095, + "step": 780 + }, + { + "ce_loss": 0.49991029500961304, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.5184733867645264, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.268743634223938, + "step": 780 + }, + { + "epoch": 0.2635090060040027, + "loss": 1.6755, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "grad_norm": 1.575848937034607, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "learning_rate": 0.0007022222222222222, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 1.4275168180465698, + "step": 790 + }, + { + "ce_loss": 0.5029415488243103, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.628352165222168, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.2929801046848297, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 1.449366807937622, + "step": 790 + }, + { + "ce_loss": 0.4599992334842682, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.5531008839607239, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.2969628870487213, + "step": 790 + }, + { + "epoch": 0.266844563042028, + "loss": 1.6387, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "grad_norm": 1.6788091659545898, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "learning_rate": 0.0007111111111111111, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 1.4919285774230957, + "step": 800 + }, + { + "ce_loss": 0.45864376425743103, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.6171651482582092, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.2416432499885559, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 1.9904141426086426, + "step": 800 + }, + { + "ce_loss": 0.46867993474006653, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.6592917442321777, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.28348398208618164, + "step": 800 + }, + { + "epoch": 0.2701801200800534, + "loss": 1.5013, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "grad_norm": 1.3329455852508545, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "learning_rate": 0.00072, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 1.204390525817871, + "step": 810 + }, + { + "ce_loss": 0.3894767463207245, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.52994304895401, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.2811121344566345, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 1.4491932392120361, + "step": 810 + }, + { + "ce_loss": 0.45897600054740906, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.5323480367660522, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.26767823100090027, + "step": 810 + }, + { + "epoch": 0.2735156771180787, + "loss": 1.8356, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "grad_norm": 3.123432159423828, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "learning_rate": 0.0007288888888888889, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 1.3312506675720215, + "step": 820 + }, + { + "ce_loss": 0.43902865052223206, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.44022202491760254, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.3157406449317932, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 1.2697056531906128, + "step": 820 + }, + { + "ce_loss": 0.4552677869796753, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.4594164788722992, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.28923386335372925, + "step": 820 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.7456, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "grad_norm": 2.2865235805511475, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "learning_rate": 0.0007377777777777778, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.4067904949188232, + "step": 830 + }, + { + "ce_loss": 0.3348756432533264, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.6199288368225098, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.23431706428527832, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.6532933712005615, + "step": 830 + }, + { + "ce_loss": 0.5002840757369995, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.6605917811393738, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.319241464138031, + "step": 830 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.7239, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "grad_norm": 1.7840464115142822, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "learning_rate": 0.0007466666666666667, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.3670352697372437, + "step": 840 + }, + { + "ce_loss": 0.4084605574607849, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 0.5723965764045715, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.2615945339202881, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.7874099016189575, + "step": 840 + }, + { + "ce_loss": 0.42878419160842896, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 0.5446598529815674, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.3162139356136322, + "step": 840 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.6546, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "grad_norm": 1.89409601688385, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "learning_rate": 0.0007555555555555555, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.8647241592407227, + "step": 850 + }, + { + "ce_loss": 0.43658173084259033, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.694576621055603, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.2831249535083771, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.6206308603286743, + "step": 850 + }, + { + "ce_loss": 0.4423183798789978, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.6064574122428894, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.2974826395511627, + "step": 850 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.566, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "grad_norm": 2.212101936340332, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "learning_rate": 0.0007644444444444445, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.5993949174880981, + "step": 860 + }, + { + "ce_loss": 0.42447516322135925, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.5617659687995911, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.2637116611003876, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.4178887605667114, + "step": 860 + }, + { + "ce_loss": 0.4780910909175873, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.5067756772041321, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.3166176974773407, + "step": 860 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.565, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "grad_norm": 1.569791316986084, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "learning_rate": 0.0007733333333333333, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 2.097165107727051, + "step": 870 + }, + { + "ce_loss": 0.507722020149231, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.5919543504714966, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.3449063301086426, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.5407272577285767, + "step": 870 + }, + { + "ce_loss": 0.4565356969833374, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.5664113163948059, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.3351632058620453, + "step": 870 + }, + { + "epoch": 0.2935290193462308, + "loss": 1.5408, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "grad_norm": 1.7846498489379883, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "learning_rate": 0.0007822222222222222, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 1.3901203870773315, + "step": 880 + }, + { + "ce_loss": 0.4673023819923401, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.5273351669311523, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.27710044384002686, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 1.1522825956344604, + "step": 880 + }, + { + "ce_loss": 0.34825077652931213, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.45659923553466797, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.24090026319026947, + "step": 880 + }, + { + "epoch": 0.2968645763842562, + "loss": 1.5535, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "grad_norm": 1.74665105342865, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "learning_rate": 0.0007911111111111111, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 1.7669625282287598, + "step": 890 + }, + { + "ce_loss": 0.5134462714195251, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.5571491718292236, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.35539567470550537, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 1.4223843812942505, + "step": 890 + }, + { + "ce_loss": 0.5171000361442566, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.5794808864593506, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.32578277587890625, + "step": 890 + }, + { + "epoch": 0.3002001334222815, + "loss": 1.5978, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "grad_norm": 2.970160484313965, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "learning_rate": 0.0008, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 1.1846829652786255, + "step": 900 + }, + { + "ce_loss": 0.4285282790660858, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.4960606098175049, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.25980421900749207, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 1.5200297832489014, + "step": 900 + }, + { + "ce_loss": 0.5215058922767639, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.5218350291252136, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.3791854977607727, + "step": 900 + }, + { + "epoch": 0.3035356904603069, + "loss": 1.4915, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "grad_norm": 1.6637479066848755, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "learning_rate": 0.0007999997665784792, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 1.5741530656814575, + "step": 910 + }, + { + "ce_loss": 0.42140352725982666, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.5193475484848022, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.26985877752304077, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 1.433876395225525, + "step": 910 + }, + { + "ce_loss": 0.40475451946258545, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.5959672927856445, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.224575012922287, + "step": 910 + }, + { + "epoch": 0.3068712474983322, + "loss": 1.6494, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "grad_norm": 2.6223065853118896, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "learning_rate": 0.0007999990663141889, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 1.777024745941162, + "step": 920 + }, + { + "ce_loss": 0.5548272728919983, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.6639838218688965, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.28127968311309814, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 1.6267937421798706, + "step": 920 + }, + { + "ce_loss": 0.5212595462799072, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.6798348426818848, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.29340386390686035, + "step": 920 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.7391, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "grad_norm": 2.6510393619537354, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "learning_rate": 0.0007999978992079467, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.4878954887390137, + "step": 930 + }, + { + "ce_loss": 0.5326961874961853, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.5999663472175598, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.3550073802471161, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.464622974395752, + "step": 930 + }, + { + "ce_loss": 0.4628712832927704, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.6238007545471191, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.27657976746559143, + "step": 930 + }, + { + "epoch": 0.3135423615743829, + "loss": 1.6287, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "grad_norm": 2.517056465148926, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "learning_rate": 0.0007999962652611144, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 1.8964029550552368, + "step": 940 + }, + { + "ce_loss": 0.4931149482727051, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.6298879384994507, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.33834245800971985, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 1.7234830856323242, + "step": 940 + }, + { + "ce_loss": 0.42898115515708923, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.6838112473487854, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.24234764277935028, + "step": 940 + }, + { + "epoch": 0.31687791861240827, + "loss": 1.6292, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "grad_norm": 2.1587040424346924, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "learning_rate": 0.0007999941644755992, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 1.4840506315231323, + "step": 950 + }, + { + "ce_loss": 0.46642470359802246, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.5663684606552124, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.3459374010562897, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 2.314152240753174, + "step": 950 + }, + { + "ce_loss": 0.4551185369491577, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.5574811100959778, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.2758069634437561, + "step": 950 + }, + { + "epoch": 0.3202134756504336, + "loss": 1.7307, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "grad_norm": 1.7937731742858887, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "learning_rate": 0.0007999915968538529, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 1.4508087635040283, + "step": 960 + }, + { + "ce_loss": 0.501089870929718, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.5429267883300781, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.4066421389579773, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 1.428647756576538, + "step": 960 + }, + { + "ce_loss": 0.4056423008441925, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.6060864925384521, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.28116607666015625, + "step": 960 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.5632, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "grad_norm": 1.5581204891204834, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "learning_rate": 0.0007999885623988721, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.5437523126602173, + "step": 970 + }, + { + "ce_loss": 0.5129994750022888, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.6591445207595825, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.26789551973342896, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.9831459522247314, + "step": 970 + }, + { + "ce_loss": 0.46284744143486023, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.5761644840240479, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.34350600838661194, + "step": 970 + }, + { + "epoch": 0.3268845897264843, + "loss": 1.6733, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "grad_norm": 2.113006830215454, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "learning_rate": 0.0007999850611141987, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 1.4288175106048584, + "step": 980 + }, + { + "ce_loss": 0.4255240559577942, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.5245460271835327, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.32837823033332825, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 1.5835850238800049, + "step": 980 + }, + { + "ce_loss": 0.509605348110199, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.521268367767334, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.34805089235305786, + "step": 980 + }, + { + "epoch": 0.33022014676450967, + "loss": 1.4216, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "grad_norm": 2.097587823867798, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "learning_rate": 0.0007999810930039185, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 1.6663522720336914, + "step": 990 + }, + { + "ce_loss": 0.4322049021720886, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.5957827568054199, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.2946665287017822, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 1.6669708490371704, + "step": 990 + }, + { + "ce_loss": 0.528470516204834, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.6283411979675293, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.3029085397720337, + "step": 990 + }, + { + "epoch": 0.333555703802535, + "loss": 1.669, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "grad_norm": 4.047603607177734, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "learning_rate": 0.0007999766580726633, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.288991928100586, + "step": 1000 + }, + { + "ce_loss": 0.3834209442138672, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.4667183458805084, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.3061845302581787, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.3131588697433472, + "step": 1000 + }, + { + "ce_loss": 0.42746010422706604, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.5145797729492188, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.26582035422325134, + "step": 1000 + }, + { + "epoch": 0.33689126084056037, + "loss": 1.5305, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "grad_norm": 1.7009576559066772, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "learning_rate": 0.0007999717563256087, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 1.335223913192749, + "step": 1010 + }, + { + "ce_loss": 0.42706549167633057, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.5178505778312683, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.27624693512916565, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 1.4561131000518799, + "step": 1010 + }, + { + "ce_loss": 0.46970024704933167, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.5584217309951782, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.3316168785095215, + "step": 1010 + }, + { + "epoch": 0.3402268178785857, + "loss": 1.5415, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "grad_norm": 1.8947089910507202, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "learning_rate": 0.0007999663877684757, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 1.2597054243087769, + "step": 1020 + }, + { + "ce_loss": 0.39339420199394226, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.5205109715461731, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.2417498379945755, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 1.2326630353927612, + "step": 1020 + }, + { + "ce_loss": 0.41173839569091797, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.5313530564308167, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.289476215839386, + "step": 1020 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.495, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "grad_norm": 1.6055141687393188, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "learning_rate": 0.0007999605524075302, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.3744783401489258, + "step": 1030 + }, + { + "ce_loss": 0.4202525019645691, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.5212565064430237, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.3207845687866211, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.725656270980835, + "step": 1030 + }, + { + "ce_loss": 0.41157498955726624, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.6025432348251343, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.2683902084827423, + "step": 1030 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.6478, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "grad_norm": 1.7481364011764526, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "learning_rate": 0.0007999542502495823, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.301788330078125, + "step": 1040 + }, + { + "ce_loss": 0.5349807739257812, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.44870883226394653, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.3173947036266327, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.5995664596557617, + "step": 1040 + }, + { + "ce_loss": 0.4531072974205017, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.44370460510253906, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.2895096242427826, + "step": 1040 + }, + { + "epoch": 0.35023348899266177, + "loss": 1.6017, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "grad_norm": 1.4346905946731567, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "learning_rate": 0.0007999474813019875, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 1.5935039520263672, + "step": 1050 + }, + { + "ce_loss": 0.5919435620307922, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.5254257321357727, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.3546173572540283, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 1.3503769636154175, + "step": 1050 + }, + { + "ce_loss": 0.5056501030921936, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.45085012912750244, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.3113629221916199, + "step": 1050 + }, + { + "epoch": 0.3535690460306871, + "loss": 1.5644, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "grad_norm": 2.372422456741333, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "learning_rate": 0.000799940245572646, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 1.4446749687194824, + "step": 1060 + }, + { + "ce_loss": 0.4489143192768097, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.6644142270088196, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.22634552419185638, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 1.4582650661468506, + "step": 1060 + }, + { + "ce_loss": 0.40130969882011414, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.5357669591903687, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.3189239799976349, + "step": 1060 + }, + { + "epoch": 0.35690460306871247, + "loss": 1.5684, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "grad_norm": 1.7825205326080322, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "learning_rate": 0.0007999325430700026, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 1.6820989847183228, + "step": 1070 + }, + { + "ce_loss": 0.5786712765693665, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.608523428440094, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.40291449427604675, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 1.5814471244812012, + "step": 1070 + }, + { + "ce_loss": 0.5588512420654297, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.6993938684463501, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.31999915838241577, + "step": 1070 + }, + { + "epoch": 0.3602401601067378, + "loss": 1.6031, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "grad_norm": 2.025482177734375, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "learning_rate": 0.0007999243738030467, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 1.284332036972046, + "step": 1080 + }, + { + "ce_loss": 0.39245760440826416, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.4988771677017212, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.28330400586128235, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 1.3760321140289307, + "step": 1080 + }, + { + "ce_loss": 0.46022656559944153, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.5289139747619629, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.28009310364723206, + "step": 1080 + }, + { + "epoch": 0.36357571714476317, + "loss": 1.5647, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "grad_norm": 1.46823251247406, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "learning_rate": 0.0007999157377813131, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 1.3993898630142212, + "step": 1090 + }, + { + "ce_loss": 0.45543530583381653, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.5258496999740601, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.31005752086639404, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 1.3657909631729126, + "step": 1090 + }, + { + "ce_loss": 0.41977930068969727, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.5256907939910889, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.31721067428588867, + "step": 1090 + }, + { + "epoch": 0.3669112741827885, + "loss": 1.6111, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "grad_norm": 8.720490455627441, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "learning_rate": 0.0007999066350148808, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 1.4530463218688965, + "step": 1100 + }, + { + "ce_loss": 0.4756782054901123, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.6375422477722168, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.3396244943141937, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 1.5000605583190918, + "step": 1100 + }, + { + "ce_loss": 0.42159199714660645, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.6526952981948853, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.25416651368141174, + "step": 1100 + }, + { + "epoch": 0.37024683122081387, + "loss": 1.5306, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "grad_norm": 3.5046091079711914, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "learning_rate": 0.0007998970655143737, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 1.6367322206497192, + "step": 1110 + }, + { + "ce_loss": 0.5216576457023621, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.7075483202934265, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.2824236750602722, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 2.0098891258239746, + "step": 1110 + }, + { + "ce_loss": 0.48691219091415405, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.7101386189460754, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.26211676001548767, + "step": 1110 + }, + { + "epoch": 0.3735823882588392, + "loss": 1.6648, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "grad_norm": 1.5141630172729492, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "learning_rate": 0.0007998870292909604, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 1.2798373699188232, + "step": 1120 + }, + { + "ce_loss": 0.3349030017852783, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.49389535188674927, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.25652116537094116, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 1.777625322341919, + "step": 1120 + }, + { + "ce_loss": 0.48351800441741943, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.576139509677887, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.32994407415390015, + "step": 1120 + }, + { + "epoch": 0.37691794529686456, + "loss": 1.5918, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "grad_norm": 1.6688812971115112, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "learning_rate": 0.0007998765263563544, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 2.2887845039367676, + "step": 1130 + }, + { + "ce_loss": 0.48430436849594116, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.5467234253883362, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.3000369966030121, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 1.4230353832244873, + "step": 1130 + }, + { + "ce_loss": 0.38922080397605896, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.5107974410057068, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.2696387469768524, + "step": 1130 + }, + { + "epoch": 0.3802535023348899, + "loss": 1.615, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "grad_norm": 2.4678921699523926, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "learning_rate": 0.0007998655567228134, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 1.6331299543380737, + "step": 1140 + }, + { + "ce_loss": 0.4924895763397217, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.5652615427970886, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.3399295210838318, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 1.5238920450210571, + "step": 1140 + }, + { + "ce_loss": 0.4449857771396637, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.5468964576721191, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.31205645203590393, + "step": 1140 + }, + { + "epoch": 0.38358905937291526, + "loss": 1.4915, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "grad_norm": 1.4147001504898071, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "learning_rate": 0.0007998541204031406, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 1.9572725296020508, + "step": 1150 + }, + { + "ce_loss": 0.4233904480934143, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.5421593189239502, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.2687678337097168, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 1.4173822402954102, + "step": 1150 + }, + { + "ce_loss": 0.37679892778396606, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.531260073184967, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.25782090425491333, + "step": 1150 + }, + { + "epoch": 0.3869246164109406, + "loss": 1.6581, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "grad_norm": 1.9394618272781372, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "learning_rate": 0.0007998422174106831, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 1.4838085174560547, + "step": 1160 + }, + { + "ce_loss": 0.46089208126068115, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.4678708612918854, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.2929135859012604, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 1.532386302947998, + "step": 1160 + }, + { + "ce_loss": 0.3994285762310028, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.5381571054458618, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.26682382822036743, + "step": 1160 + }, + { + "epoch": 0.39026017344896596, + "loss": 1.6421, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "grad_norm": 1.5442798137664795, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "learning_rate": 0.0007998298477593331, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 1.495753288269043, + "step": 1170 + }, + { + "ce_loss": 0.3334011733531952, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.4416235089302063, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.2266778200864792, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 1.2930865287780762, + "step": 1170 + }, + { + "ce_loss": 0.3840572237968445, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.5391579866409302, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.2568325996398926, + "step": 1170 + }, + { + "epoch": 0.3935957304869913, + "loss": 1.6158, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "grad_norm": 2.4269447326660156, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "learning_rate": 0.0007998170114635274, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 1.4814577102661133, + "step": 1180 + }, + { + "ce_loss": 0.4510352611541748, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.5262930393218994, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.3344971835613251, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 1.261969804763794, + "step": 1180 + }, + { + "ce_loss": 0.4206009805202484, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.5127648115158081, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.3275764584541321, + "step": 1180 + }, + { + "epoch": 0.39693128752501666, + "loss": 1.5864, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "grad_norm": 1.4139463901519775, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "learning_rate": 0.0007998037085382471, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 2.3221168518066406, + "step": 1190 + }, + { + "ce_loss": 0.43068450689315796, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.6078692078590393, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.27681607007980347, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 1.539637804031372, + "step": 1190 + }, + { + "ce_loss": 0.45647120475769043, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.6491154432296753, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.2978929281234741, + "step": 1190 + }, + { + "epoch": 0.400266844563042, + "loss": 1.531, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "grad_norm": 1.8300831317901611, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "learning_rate": 0.0007997899389990183, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 1.3315988779067993, + "step": 1200 + }, + { + "ce_loss": 0.4322234094142914, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.4977712333202362, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.2532905042171478, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 1.5709469318389893, + "step": 1200 + }, + { + "ce_loss": 0.4227243661880493, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.4741690158843994, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.33167847990989685, + "step": 1200 + }, + { + "epoch": 0.40360240160106736, + "loss": 1.5606, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "grad_norm": 2.384575366973877, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "learning_rate": 0.0007997757028619115, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 1.5894966125488281, + "step": 1210 + }, + { + "ce_loss": 0.45032036304473877, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.6991457343101501, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.2911129295825958, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 1.957366704940796, + "step": 1210 + }, + { + "ce_loss": 0.4575033485889435, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.5383639335632324, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.28521963953971863, + "step": 1210 + }, + { + "epoch": 0.4069379586390927, + "loss": 1.7073, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "grad_norm": 1.6603896617889404, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "learning_rate": 0.0007997610001435419, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 1.9320708513259888, + "step": 1220 + }, + { + "ce_loss": 0.4879373610019684, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.4297964572906494, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.36906328797340393, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 1.1487407684326172, + "step": 1220 + }, + { + "ce_loss": 0.41289323568344116, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.41345977783203125, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.3178797960281372, + "step": 1220 + }, + { + "epoch": 0.41027351567711806, + "loss": 1.5026, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "grad_norm": 1.7940484285354614, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "learning_rate": 0.000799745830861069, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 1.5663365125656128, + "step": 1230 + }, + { + "ce_loss": 0.4955677092075348, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.6290001273155212, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.33181193470954895, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 1.218145489692688, + "step": 1230 + }, + { + "ce_loss": 0.37379351258277893, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.48004651069641113, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.27316486835479736, + "step": 1230 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.5787, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "grad_norm": 2.2292141914367676, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "learning_rate": 0.0007997301950321971, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.3395863771438599, + "step": 1240 + }, + { + "ce_loss": 0.3790244460105896, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.47209668159484863, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.27871784567832947, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 2.0479073524475098, + "step": 1240 + }, + { + "ce_loss": 0.39347025752067566, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.5617333054542542, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.26081743836402893, + "step": 1240 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.5392, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "grad_norm": 1.818170428276062, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "learning_rate": 0.0007997140926751748, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.4717570543289185, + "step": 1250 + }, + { + "ce_loss": 0.5022345781326294, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.5199416279792786, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.3118363916873932, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.484877347946167, + "step": 1250 + }, + { + "ce_loss": 0.4070572257041931, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.48654651641845703, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.34332475066185, + "step": 1250 + }, + { + "epoch": 0.4202801867911941, + "loss": 1.655, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "grad_norm": 3.15238881111145, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "learning_rate": 0.0007996975238087954, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 1.3359380960464478, + "step": 1260 + }, + { + "ce_loss": 0.38020819425582886, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.5758382081985474, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.27918919920921326, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 1.6192512512207031, + "step": 1260 + }, + { + "ce_loss": 0.5164095759391785, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.6622616052627563, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.44050195813179016, + "step": 1260 + }, + { + "epoch": 0.42361574382921946, + "loss": 1.6418, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "grad_norm": 4.3235039710998535, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "learning_rate": 0.0007996804884523964, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 1.345386266708374, + "step": 1270 + }, + { + "ce_loss": 0.4195159375667572, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.6241512298583984, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.3016016185283661, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 1.4211335182189941, + "step": 1270 + }, + { + "ce_loss": 0.4531877338886261, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.6978771090507507, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.2688674032688141, + "step": 1270 + }, + { + "epoch": 0.4269513008672448, + "loss": 1.5536, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "grad_norm": 1.699332594871521, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "learning_rate": 0.00079966298662586, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 2.720428228378296, + "step": 1280 + }, + { + "ce_loss": 0.4882067143917084, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.5594663619995117, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.3512772023677826, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 1.3232649564743042, + "step": 1280 + }, + { + "ce_loss": 0.39429083466529846, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.5630033612251282, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.2832672894001007, + "step": 1280 + }, + { + "epoch": 0.43028685790527016, + "loss": 1.5439, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "grad_norm": 1.461922287940979, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "learning_rate": 0.0007996450183496126, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 1.575237512588501, + "step": 1290 + }, + { + "ce_loss": 0.505817174911499, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.6163088083267212, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.2900249660015106, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 1.3824915885925293, + "step": 1290 + }, + { + "ce_loss": 0.4555290639400482, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.6401718258857727, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.2867133617401123, + "step": 1290 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.5352, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "grad_norm": 4.11077356338501, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "learning_rate": 0.0007996265836446254, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.5897566080093384, + "step": 1300 + }, + { + "ce_loss": 0.3810420334339142, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.5214735269546509, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.2401161789894104, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.3515366315841675, + "step": 1300 + }, + { + "ce_loss": 0.3965347409248352, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.5576469898223877, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.28013500571250916, + "step": 1300 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.5448, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "grad_norm": 1.6120764017105103, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "learning_rate": 0.0007996076825324133, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.7771337032318115, + "step": 1310 + }, + { + "ce_loss": 0.43278172612190247, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.4835440218448639, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.28642958402633667, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.9378416538238525, + "step": 1310 + }, + { + "ce_loss": 0.49280300736427307, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.5592789649963379, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.2844145596027374, + "step": 1310 + }, + { + "epoch": 0.4402935290193462, + "loss": 1.6367, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "grad_norm": 3.2773890495300293, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "learning_rate": 0.0007995883150350363, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 1.526888132095337, + "step": 1320 + }, + { + "ce_loss": 0.43740856647491455, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.579285204410553, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.27087196707725525, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 1.2348921298980713, + "step": 1320 + }, + { + "ce_loss": 0.43266892433166504, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.5434672832489014, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.258668452501297, + "step": 1320 + }, + { + "epoch": 0.44362908605737156, + "loss": 1.5593, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "grad_norm": 1.6326991319656372, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "learning_rate": 0.000799568481175098, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 1.2025867700576782, + "step": 1330 + }, + { + "ce_loss": 0.3716997802257538, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.4663502871990204, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.32629045844078064, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 1.7404615879058838, + "step": 1330 + }, + { + "ce_loss": 0.39141565561294556, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.570256769657135, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.2961578965187073, + "step": 1330 + }, + { + "epoch": 0.4469646430953969, + "loss": 1.5984, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "grad_norm": 1.4796324968338013, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "learning_rate": 0.000799548180975747, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 2.8081512451171875, + "step": 1340 + }, + { + "ce_loss": 0.47508761286735535, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.5701019167900085, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.2469950169324875, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 1.1866693496704102, + "step": 1340 + }, + { + "ce_loss": 0.37990590929985046, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.4495466947555542, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.2721485197544098, + "step": 1340 + }, + { + "epoch": 0.45030020013342226, + "loss": 1.6328, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "grad_norm": 1.5720679759979248, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "learning_rate": 0.0007995274144606755, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 1.2863354682922363, + "step": 1350 + }, + { + "ce_loss": 0.3292873799800873, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.5815794467926025, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.2648515999317169, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 1.2642462253570557, + "step": 1350 + }, + { + "ce_loss": 0.3768036365509033, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.5950371026992798, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.29059046506881714, + "step": 1350 + }, + { + "epoch": 0.4536357571714476, + "loss": 1.4408, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "grad_norm": 2.709803819656372, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "learning_rate": 0.0007995061816541204, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 1.3702350854873657, + "step": 1360 + }, + { + "ce_loss": 0.44703322649002075, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.5962932705879211, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.32672789692878723, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 1.504996418952942, + "step": 1360 + }, + { + "ce_loss": 0.43612051010131836, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.6245546936988831, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.24796253442764282, + "step": 1360 + }, + { + "epoch": 0.45697131420947296, + "loss": 1.4248, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "grad_norm": 1.779395580291748, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "learning_rate": 0.0007994844825808628, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 1.311972975730896, + "step": 1370 + }, + { + "ce_loss": 0.4517594575881958, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.4873427748680115, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.29018694162368774, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 1.8570822477340698, + "step": 1370 + }, + { + "ce_loss": 0.48192328214645386, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.5890644788742065, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.3222361207008362, + "step": 1370 + }, + { + "epoch": 0.4603068712474983, + "loss": 1.4956, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "grad_norm": 1.5447510480880737, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "learning_rate": 0.0007994623172662275, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 1.429722785949707, + "step": 1380 + }, + { + "ce_loss": 0.4304126501083374, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.5462210178375244, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.28710076212882996, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 1.4016376733779907, + "step": 1380 + }, + { + "ce_loss": 0.3747934103012085, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.5976477265357971, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.30236348509788513, + "step": 1380 + }, + { + "epoch": 0.46364242828552366, + "loss": 1.5523, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "grad_norm": 1.9847055673599243, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "learning_rate": 0.0007994396857360842, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 1.348907709121704, + "step": 1390 + }, + { + "ce_loss": 0.4562534987926483, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.6037919521331787, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.28767332434654236, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 2.5337958335876465, + "step": 1390 + }, + { + "ce_loss": 0.42729249596595764, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.6115925312042236, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.2527952492237091, + "step": 1390 + }, + { + "epoch": 0.466977985323549, + "loss": 1.6165, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "grad_norm": 2.61089825630188, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "learning_rate": 0.0007994165880168461, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 1.4341270923614502, + "step": 1400 + }, + { + "ce_loss": 0.4411405622959137, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.5347835421562195, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.3186250627040863, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 1.515146255493164, + "step": 1400 + }, + { + "ce_loss": 0.42187491059303284, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.5460115075111389, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.24013651907444, + "step": 1400 + }, + { + "epoch": 0.4703135423615744, + "loss": 1.6537, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "grad_norm": 2.4364840984344482, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "learning_rate": 0.0007993930241354708, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 1.3404841423034668, + "step": 1410 + }, + { + "ce_loss": 0.4299369752407074, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.5847266316413879, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.3256117105484009, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 1.4295798540115356, + "step": 1410 + }, + { + "ce_loss": 0.46195846796035767, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.5672675967216492, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.2844446003437042, + "step": 1410 + }, + { + "epoch": 0.47364909939959976, + "loss": 1.538, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "grad_norm": 2.079164981842041, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "learning_rate": 0.0007993689941194598, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 1.8757567405700684, + "step": 1420 + }, + { + "ce_loss": 0.49489399790763855, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.5338597297668457, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.3119715750217438, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 1.5323330163955688, + "step": 1420 + }, + { + "ce_loss": 0.4838011562824249, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.5049943923950195, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.377859503030777, + "step": 1420 + }, + { + "epoch": 0.4769846564376251, + "loss": 1.5542, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "grad_norm": 1.6398972272872925, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "learning_rate": 0.0007993444979968588, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 1.2284057140350342, + "step": 1430 + }, + { + "ce_loss": 0.4466082751750946, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.5416300296783447, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.24010510742664337, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 1.525360345840454, + "step": 1430 + }, + { + "ce_loss": 0.48976385593414307, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.5415977835655212, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.35288766026496887, + "step": 1430 + }, + { + "epoch": 0.48032021347565046, + "loss": 1.5164, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "grad_norm": 1.6208374500274658, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "learning_rate": 0.0007993195357962575, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 1.698353886604309, + "step": 1440 + }, + { + "ce_loss": 0.4378069341182709, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.6145537495613098, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.2727295756340027, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 1.3916555643081665, + "step": 1440 + }, + { + "ce_loss": 0.3999626040458679, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.5818359851837158, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.29204344749450684, + "step": 1440 + }, + { + "epoch": 0.4836557705136758, + "loss": 1.5253, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "grad_norm": 3.18560528755188, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "learning_rate": 0.0007992941075467892, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 1.3466286659240723, + "step": 1450 + }, + { + "ce_loss": 0.35075056552886963, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.42663854360580444, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.2799513041973114, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 1.619462490081787, + "step": 1450 + }, + { + "ce_loss": 0.5010478496551514, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.5300058126449585, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.3170461356639862, + "step": 1450 + }, + { + "epoch": 0.48699132755170116, + "loss": 1.6154, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "grad_norm": 1.907476782798767, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "learning_rate": 0.0007992682132781317, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 1.7880961894989014, + "step": 1460 + }, + { + "ce_loss": 0.44775262475013733, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.5088525414466858, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.2713979184627533, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 1.7701867818832397, + "step": 1460 + }, + { + "ce_loss": 0.49875637888908386, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.6390624642372131, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.2926272451877594, + "step": 1460 + }, + { + "epoch": 0.4903268845897265, + "loss": 1.5046, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "grad_norm": 1.6721819639205933, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "learning_rate": 0.0007992418530205062, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 1.2131767272949219, + "step": 1470 + }, + { + "ce_loss": 0.38331499695777893, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.47394928336143494, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.26265010237693787, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 1.0815149545669556, + "step": 1470 + }, + { + "ce_loss": 0.3649250566959381, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.48027974367141724, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.23579445481300354, + "step": 1470 + }, + { + "epoch": 0.49366244162775186, + "loss": 1.4974, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "grad_norm": 2.451059579849243, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "learning_rate": 0.000799215026804678, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 2.313891887664795, + "step": 1480 + }, + { + "ce_loss": 0.46999824047088623, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.5875487327575684, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.3389456868171692, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 1.3807448148727417, + "step": 1480 + }, + { + "ce_loss": 0.4419082999229431, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.5672412514686584, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.2629389762878418, + "step": 1480 + }, + { + "epoch": 0.4969979986657772, + "loss": 1.4781, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "grad_norm": 1.9855118989944458, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "learning_rate": 0.0007991877346619562, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 1.5322380065917969, + "step": 1490 + }, + { + "ce_loss": 0.4553185701370239, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.49299630522727966, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.2738795876502991, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 1.3188645839691162, + "step": 1490 + }, + { + "ce_loss": 0.43871697783470154, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.5200066566467285, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.3599816858768463, + "step": 1490 + }, + { + "epoch": 0.5003335557038026, + "loss": 1.4846, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "grad_norm": 2.409991502761841, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "learning_rate": 0.0007991599766241939, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 1.4143189191818237, + "step": 1500 + }, + { + "ce_loss": 0.3943651020526886, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.4304821491241455, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.2800663113594055, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 1.6676888465881348, + "step": 1500 + }, + { + "ce_loss": 0.4340650141239166, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.5943726897239685, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.2812447249889374, + "step": 1500 + }, + { + "epoch": 0.5036691127418279, + "loss": 1.5091, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "grad_norm": 1.7553213834762573, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "learning_rate": 0.0007991317527237872, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 1.3898941278457642, + "step": 1510 + }, + { + "ce_loss": 0.4156953990459442, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.5571177005767822, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.2963123619556427, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 1.3989025354385376, + "step": 1510 + }, + { + "ce_loss": 0.44018229842185974, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.5224853157997131, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.3302108943462372, + "step": 1510 + }, + { + "epoch": 0.5070046697798533, + "loss": 1.3539, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "grad_norm": 1.661407232284546, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "learning_rate": 0.0007991030629936768, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 1.38296377658844, + "step": 1520 + }, + { + "ce_loss": 0.35517755150794983, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.4815816283226013, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.22595949470996857, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 1.6152260303497314, + "step": 1520 + }, + { + "ce_loss": 0.5425209403038025, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.6551669836044312, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.3061394989490509, + "step": 1520 + }, + { + "epoch": 0.5103402268178786, + "loss": 1.423, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "grad_norm": 1.6371610164642334, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "learning_rate": 0.0007990739074673468, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 1.2464088201522827, + "step": 1530 + }, + { + "ce_loss": 0.39948147535324097, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.45707568526268005, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.2859954833984375, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 1.2391449213027954, + "step": 1530 + }, + { + "ce_loss": 0.4184815585613251, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.5067058801651001, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.31185799837112427, + "step": 1530 + }, + { + "epoch": 0.513675783855904, + "loss": 1.5512, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "grad_norm": 2.0008840560913086, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "learning_rate": 0.0007990442861788244, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 1.412217378616333, + "step": 1540 + }, + { + "ce_loss": 0.4328736960887909, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.5061045289039612, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.35407111048698425, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 1.3271783590316772, + "step": 1540 + }, + { + "ce_loss": 0.3384850323200226, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.4997933506965637, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.31482452154159546, + "step": 1540 + }, + { + "epoch": 0.5170113408939293, + "loss": 1.5203, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "grad_norm": 2.0364885330200195, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "learning_rate": 0.0007990141991626813, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 1.3766908645629883, + "step": 1550 + }, + { + "ce_loss": 0.4917585253715515, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.5370101928710938, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.2637538015842438, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 1.4567843675613403, + "step": 1550 + }, + { + "ce_loss": 0.5104137063026428, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.49222514033317566, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.3065930902957916, + "step": 1550 + }, + { + "epoch": 0.5203468979319547, + "loss": 1.5151, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "grad_norm": 2.2353527545928955, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "learning_rate": 0.0007989836464540318, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 1.3307873010635376, + "step": 1560 + }, + { + "ce_loss": 0.3729870617389679, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.6389895677566528, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.27055856585502625, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 1.5160497426986694, + "step": 1560 + }, + { + "ce_loss": 0.47393983602523804, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.6437463164329529, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.31374311447143555, + "step": 1560 + }, + { + "epoch": 0.52368245496998, + "loss": 1.4511, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "grad_norm": 2.509186267852783, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "learning_rate": 0.0007989526280885348, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 1.4282000064849854, + "step": 1570 + }, + { + "ce_loss": 0.3524983823299408, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.5033807158470154, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.21685640513896942, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 2.016537666320801, + "step": 1570 + }, + { + "ce_loss": 0.46554210782051086, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.5887246131896973, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.3299577832221985, + "step": 1570 + }, + { + "epoch": 0.5270180120080054, + "loss": 1.4662, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "grad_norm": 1.5988291501998901, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "learning_rate": 0.0007989211441023914, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 1.3795303106307983, + "step": 1580 + }, + { + "ce_loss": 0.31225863099098206, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.6102480888366699, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.24404440820217133, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 1.2649134397506714, + "step": 1580 + }, + { + "ce_loss": 0.3859570026397705, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.5082134008407593, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.2923862934112549, + "step": 1580 + }, + { + "epoch": 0.5303535690460307, + "loss": 1.3763, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "grad_norm": 1.6285555362701416, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "learning_rate": 0.0007988891945323474, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 1.705237865447998, + "step": 1590 + }, + { + "ce_loss": 0.41622549295425415, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.5183006525039673, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.29350215196609497, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 1.2734066247940063, + "step": 1590 + }, + { + "ce_loss": 0.44327646493911743, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.5529025793075562, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.27696895599365234, + "step": 1590 + }, + { + "epoch": 0.533689126084056, + "loss": 1.4172, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "grad_norm": 1.6855443716049194, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "learning_rate": 0.000798856779415691, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 1.2596999406814575, + "step": 1600 + }, + { + "ce_loss": 0.40576714277267456, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.5716956853866577, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.2820309102535248, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 1.3510174751281738, + "step": 1600 + }, + { + "ce_loss": 0.4361741542816162, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.5999810099601746, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.24159260094165802, + "step": 1600 + }, + { + "epoch": 0.5370246831220814, + "loss": 1.5089, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "grad_norm": 1.5545297861099243, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "learning_rate": 0.0007988238987902543, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 1.9645248651504517, + "step": 1610 + }, + { + "ce_loss": 0.4018411934375763, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.44524702429771423, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.33014345169067383, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 1.085498332977295, + "step": 1610 + }, + { + "ce_loss": 0.3779771327972412, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.39916783571243286, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.3057388663291931, + "step": 1610 + }, + { + "epoch": 0.5403602401601068, + "loss": 1.5661, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "grad_norm": 2.2747015953063965, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "learning_rate": 0.0007987905526944125, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 1.3721652030944824, + "step": 1620 + }, + { + "ce_loss": 0.4520910382270813, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.607236385345459, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.3124680817127228, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 1.5054634809494019, + "step": 1620 + }, + { + "ce_loss": 0.4722904562950134, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.5852570533752441, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.2933380603790283, + "step": 1620 + }, + { + "epoch": 0.5436957971981321, + "loss": 1.3952, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "grad_norm": 1.8939306735992432, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "learning_rate": 0.000798756741167084, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 1.1996138095855713, + "step": 1630 + }, + { + "ce_loss": 0.3728450536727905, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.4565788507461548, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.24662546813488007, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 1.341873288154602, + "step": 1630 + }, + { + "ce_loss": 0.4312497079372406, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.52656090259552, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.2624252736568451, + "step": 1630 + }, + { + "epoch": 0.5470313542361575, + "loss": 1.407, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "grad_norm": 1.7016750574111938, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "learning_rate": 0.0007987224642477307, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 1.5102639198303223, + "step": 1640 + }, + { + "ce_loss": 0.4606504440307617, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.6021109819412231, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.2885037958621979, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 2.82285213470459, + "step": 1640 + }, + { + "ce_loss": 0.41885632276535034, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.5302011370658875, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.3432007431983948, + "step": 1640 + }, + { + "epoch": 0.5503669112741828, + "loss": 1.5286, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "grad_norm": 1.6959972381591797, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "learning_rate": 0.0007986877219763572, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 1.4738940000534058, + "step": 1650 + }, + { + "ce_loss": 0.40843141078948975, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.49939486384391785, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.24556097388267517, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 1.0911322832107544, + "step": 1650 + }, + { + "ce_loss": 0.3265611231327057, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.4817613363265991, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.18119016289710999, + "step": 1650 + }, + { + "epoch": 0.5537024683122082, + "loss": 1.4085, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "grad_norm": 1.3222358226776123, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "learning_rate": 0.0007986525143935115, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 1.2532429695129395, + "step": 1660 + }, + { + "ce_loss": 0.45698484778404236, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.5718897581100464, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.2241760790348053, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 1.4703891277313232, + "step": 1660 + }, + { + "ce_loss": 0.44330742955207825, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.5229434967041016, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.2593224048614502, + "step": 1660 + }, + { + "epoch": 0.5570380253502335, + "loss": 1.4088, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "grad_norm": 1.3066946268081665, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "learning_rate": 0.000798616841540285, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 1.1286921501159668, + "step": 1670 + }, + { + "ce_loss": 0.3801676034927368, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.40562373399734497, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.2621326148509979, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 1.7087914943695068, + "step": 1670 + }, + { + "ce_loss": 0.4122104346752167, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.5846372842788696, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.2881487011909485, + "step": 1670 + }, + { + "epoch": 0.5603735823882589, + "loss": 1.5027, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "grad_norm": 2.6256797313690186, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "learning_rate": 0.0007985807034583111, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 1.4386684894561768, + "step": 1680 + }, + { + "ce_loss": 0.4299916923046112, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.6190658807754517, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.30958718061447144, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 1.4426429271697998, + "step": 1680 + }, + { + "ce_loss": 0.4818243086338043, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.5826115608215332, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.29600685834884644, + "step": 1680 + }, + { + "epoch": 0.5637091394262842, + "loss": 1.4569, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "grad_norm": 2.3843090534210205, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "learning_rate": 0.0007985441001897675, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 2.2765228748321533, + "step": 1690 + }, + { + "ce_loss": 0.423153817653656, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.4943941533565521, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.23956818878650665, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 1.485368251800537, + "step": 1690 + }, + { + "ce_loss": 0.3642539083957672, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.39737266302108765, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.2661179006099701, + "step": 1690 + }, + { + "epoch": 0.5670446964643095, + "loss": 1.507, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "grad_norm": 1.6635305881500244, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "learning_rate": 0.0007985070317773737, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 1.5071004629135132, + "step": 1700 + }, + { + "ce_loss": 0.3156302571296692, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.4929995536804199, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.24376991391181946, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 1.4015324115753174, + "step": 1700 + }, + { + "ce_loss": 0.4085094928741455, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.559135913848877, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.28080520033836365, + "step": 1700 + }, + { + "epoch": 0.5703802535023349, + "loss": 1.384, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "grad_norm": 1.3340486288070679, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "learning_rate": 0.0007984694982643927, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 1.5864113569259644, + "step": 1710 + }, + { + "ce_loss": 0.40390101075172424, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.5704938173294067, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.2445508986711502, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 1.1265581846237183, + "step": 1710 + }, + { + "ce_loss": 0.3849276304244995, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.5128688812255859, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.22824981808662415, + "step": 1710 + }, + { + "epoch": 0.5737158105403602, + "loss": 1.4421, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "grad_norm": 1.9246883392333984, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "learning_rate": 0.0007984314996946303, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 1.3969639539718628, + "step": 1720 + }, + { + "ce_loss": 0.4116149842739105, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.5273759961128235, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.2239617109298706, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 1.1578283309936523, + "step": 1720 + }, + { + "ce_loss": 0.4013277292251587, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.4725862145423889, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.2722586989402771, + "step": 1720 + }, + { + "epoch": 0.5770513675783856, + "loss": 1.5764, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "grad_norm": 2.31512188911438, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "learning_rate": 0.0007983930361124345, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 1.5932635068893433, + "step": 1730 + }, + { + "ce_loss": 0.41283050179481506, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.5362582802772522, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.29913270473480225, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 1.472170114517212, + "step": 1730 + }, + { + "ce_loss": 0.41566747426986694, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.5674774050712585, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.2518981099128723, + "step": 1730 + }, + { + "epoch": 0.580386924616411, + "loss": 1.4327, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "grad_norm": 1.5882205963134766, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "learning_rate": 0.0007983541075626968, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 1.265290379524231, + "step": 1740 + }, + { + "ce_loss": 0.3291006088256836, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.5921589136123657, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.2453921139240265, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 1.5062141418457031, + "step": 1740 + }, + { + "ce_loss": 0.4185219407081604, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.5556906461715698, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.30186644196510315, + "step": 1740 + }, + { + "epoch": 0.5837224816544363, + "loss": 1.4242, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "grad_norm": 1.4376288652420044, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "learning_rate": 0.000798314714090851, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 1.2823268175125122, + "step": 1750 + }, + { + "ce_loss": 0.37645918130874634, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.4971107542514801, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.28914013504981995, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 1.4173918962478638, + "step": 1750 + }, + { + "ce_loss": 0.4171852767467499, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.5723685026168823, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.3079443871974945, + "step": 1750 + }, + { + "epoch": 0.5870580386924616, + "loss": 1.4064, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "grad_norm": 1.7479711771011353, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "learning_rate": 0.0007982748557428733, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 1.8940140008926392, + "step": 1760 + }, + { + "ce_loss": 0.4125586152076721, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.5978971123695374, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.21934141218662262, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 1.5041753053665161, + "step": 1760 + }, + { + "ce_loss": 0.45310303568840027, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.599409818649292, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.24095265567302704, + "step": 1760 + }, + { + "epoch": 0.590393595730487, + "loss": 1.5753, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "grad_norm": 3.1749627590179443, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "learning_rate": 0.0007982345325652828, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 1.2013965845108032, + "step": 1770 + }, + { + "ce_loss": 0.3765731453895569, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.5124651789665222, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.24479584395885468, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 1.5885907411575317, + "step": 1770 + }, + { + "ce_loss": 0.46473753452301025, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.5773957967758179, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.24250906705856323, + "step": 1770 + }, + { + "epoch": 0.5937291527685123, + "loss": 1.4574, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "grad_norm": 1.832212209701538, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "learning_rate": 0.0007981937446051412, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 1.673588752746582, + "step": 1780 + }, + { + "ce_loss": 0.3560979962348938, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.47634315490722656, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.23794607818126678, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 2.1981008052825928, + "step": 1780 + }, + { + "ce_loss": 0.40152403712272644, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.5010473728179932, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.3065120577812195, + "step": 1780 + }, + { + "epoch": 0.5970647098065377, + "loss": 1.4009, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "grad_norm": 1.4549444913864136, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "learning_rate": 0.0007981524919100519, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 1.4824278354644775, + "step": 1790 + }, + { + "ce_loss": 0.4323318302631378, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.6305601596832275, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.26681867241859436, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 1.265123963356018, + "step": 1790 + }, + { + "ce_loss": 0.44419583678245544, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.5172229409217834, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.3028353452682495, + "step": 1790 + }, + { + "epoch": 0.600400266844563, + "loss": 1.4739, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "grad_norm": 1.548317551612854, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "learning_rate": 0.0007981107745281618, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 1.3120996952056885, + "step": 1800 + }, + { + "ce_loss": 0.3586893379688263, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.6148962378501892, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.2554745078086853, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 1.297042727470398, + "step": 1800 + }, + { + "ce_loss": 0.41965043544769287, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.5431655049324036, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.23221886157989502, + "step": 1800 + }, + { + "epoch": 0.6037358238825884, + "loss": 1.4416, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "grad_norm": 2.4861154556274414, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "learning_rate": 0.0007980685925081592, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 1.3630973100662231, + "step": 1810 + }, + { + "ce_loss": 0.3965637981891632, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.5065696835517883, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.2848835587501526, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 1.27780282497406, + "step": 1810 + }, + { + "ce_loss": 0.4385015368461609, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.5412667393684387, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.2979799807071686, + "step": 1810 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.3629, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "grad_norm": 1.543222427368164, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "learning_rate": 0.0007980259458992752, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.8371226787567139, + "step": 1820 + }, + { + "ce_loss": 0.384300172328949, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.5954495668411255, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.26500630378723145, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.5840888023376465, + "step": 1820 + }, + { + "ce_loss": 0.48104703426361084, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.47974202036857605, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.31398510932922363, + "step": 1820 + }, + { + "epoch": 0.6104069379586391, + "loss": 1.4797, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "grad_norm": 2.317809581756592, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "learning_rate": 0.0007979828347512831, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 1.9281671047210693, + "step": 1830 + }, + { + "ce_loss": 0.4327715039253235, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.5266700983047485, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.3022516965866089, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 1.3074755668640137, + "step": 1830 + }, + { + "ce_loss": 0.41616860032081604, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.5696160197257996, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.32133573293685913, + "step": 1830 + }, + { + "epoch": 0.6137424949966644, + "loss": 1.4541, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "grad_norm": 1.9975054264068604, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "learning_rate": 0.000797939259114498, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 1.5424506664276123, + "step": 1840 + }, + { + "ce_loss": 0.4475153088569641, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.5498025417327881, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.28382542729377747, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 1.2897108793258667, + "step": 1840 + }, + { + "ce_loss": 0.42207860946655273, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.5946893692016602, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.27271324396133423, + "step": 1840 + }, + { + "epoch": 0.6170780520346898, + "loss": 1.5766, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "grad_norm": 1.6964223384857178, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "learning_rate": 0.0007978952190397774, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 1.7818580865859985, + "step": 1850 + }, + { + "ce_loss": 0.4346107542514801, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.5910945534706116, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.30669838190078735, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 1.18301260471344, + "step": 1850 + }, + { + "ce_loss": 0.4067380130290985, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.5019524097442627, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.2735179662704468, + "step": 1850 + }, + { + "epoch": 0.6204136090727151, + "loss": 1.4075, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "grad_norm": 1.5153988599777222, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "learning_rate": 0.000797850714578521, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 1.3955501317977905, + "step": 1860 + }, + { + "ce_loss": 0.39396870136260986, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.5631558895111084, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.3130049705505371, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 2.127243995666504, + "step": 1860 + }, + { + "ce_loss": 0.41338804364204407, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.5591651201248169, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.29337289929389954, + "step": 1860 + }, + { + "epoch": 0.6237491661107405, + "loss": 1.5559, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "grad_norm": 1.728464961051941, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "learning_rate": 0.0007978057457826702, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 1.0223857164382935, + "step": 1870 + }, + { + "ce_loss": 0.31188276410102844, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.4333324432373047, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.18637055158615112, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 1.4168239831924438, + "step": 1870 + }, + { + "ce_loss": 0.45807886123657227, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.5271170139312744, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.3137466013431549, + "step": 1870 + }, + { + "epoch": 0.6270847231487658, + "loss": 1.4226, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "grad_norm": 2.060706853866577, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "learning_rate": 0.0007977603127047084, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 1.2071475982666016, + "step": 1880 + }, + { + "ce_loss": 0.3951813280582428, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.504381000995636, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.3060469329357147, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 1.1786348819732666, + "step": 1880 + }, + { + "ce_loss": 0.3081275522708893, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.5031088590621948, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.2622256577014923, + "step": 1880 + }, + { + "epoch": 0.6304202801867912, + "loss": 1.4745, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "grad_norm": 3.1443939208984375, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "learning_rate": 0.0007977144153976608, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 1.4169859886169434, + "step": 1890 + }, + { + "ce_loss": 0.4933745265007019, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.5657783150672913, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.25525254011154175, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 1.5357770919799805, + "step": 1890 + }, + { + "ce_loss": 0.31528544425964355, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.5158506631851196, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.19878283143043518, + "step": 1890 + }, + { + "epoch": 0.6337558372248165, + "loss": 1.3136, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "grad_norm": 1.7846509218215942, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "learning_rate": 0.0007976680539150947, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 1.2587743997573853, + "step": 1900 + }, + { + "ce_loss": 0.40194782614707947, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.5084041357040405, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.25532692670822144, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 1.2736737728118896, + "step": 1900 + }, + { + "ce_loss": 0.45652300119400024, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.5367364883422852, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.28010380268096924, + "step": 1900 + }, + { + "epoch": 0.6370913942628419, + "loss": 1.5951, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "grad_norm": 2.80888032913208, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "learning_rate": 0.0007976212283111187, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 1.9592828750610352, + "step": 1910 + }, + { + "ce_loss": 0.416324645280838, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.6871696710586548, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.3213195502758026, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 1.4825496673583984, + "step": 1910 + }, + { + "ce_loss": 0.4488929510116577, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.6601699590682983, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.294272780418396, + "step": 1910 + }, + { + "epoch": 0.6404269513008672, + "loss": 1.5307, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "grad_norm": 1.9157191514968872, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "learning_rate": 0.0007975739386403835, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 1.4704571962356567, + "step": 1920 + }, + { + "ce_loss": 0.4426376521587372, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.6234824061393738, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.32547807693481445, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 2.2691290378570557, + "step": 1920 + }, + { + "ce_loss": 0.46476539969444275, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.6577616333961487, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.30368247628211975, + "step": 1920 + }, + { + "epoch": 0.6437625083388926, + "loss": 1.5061, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "grad_norm": 2.470832109451294, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "learning_rate": 0.0007975261849580813, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 1.0962388515472412, + "step": 1930 + }, + { + "ce_loss": 0.33751609921455383, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.5582593679428101, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.20032666623592377, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 1.4692728519439697, + "step": 1930 + }, + { + "ce_loss": 0.39003822207450867, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.5492802858352661, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.26024729013442993, + "step": 1930 + }, + { + "epoch": 0.6470980653769179, + "loss": 1.4432, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "grad_norm": 1.8819775581359863, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "learning_rate": 0.0007974779673199456, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 1.2645516395568848, + "step": 1940 + }, + { + "ce_loss": 0.40579256415367126, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.4985727369785309, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.25948283076286316, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 1.2759686708450317, + "step": 1940 + }, + { + "ce_loss": 0.3738131821155548, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.46578264236450195, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.2944307029247284, + "step": 1940 + }, + { + "epoch": 0.6504336224149433, + "loss": 1.3298, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "grad_norm": 1.5547473430633545, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "learning_rate": 0.0007974292857822515, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 1.2107421159744263, + "step": 1950 + }, + { + "ce_loss": 0.4242013096809387, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.497577965259552, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.2884838879108429, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 1.2990264892578125, + "step": 1950 + }, + { + "ce_loss": 0.38576236367225647, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.5003119707107544, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.27664613723754883, + "step": 1950 + }, + { + "epoch": 0.6537691794529686, + "loss": 1.3588, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "grad_norm": 1.5756919384002686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "learning_rate": 0.0007973801404018158, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 1.676820158958435, + "step": 1960 + }, + { + "ce_loss": 0.45319217443466187, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.6395261883735657, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.28354665637016296, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 1.5755594968795776, + "step": 1960 + }, + { + "ce_loss": 0.35511845350265503, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.5487231016159058, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.25075337290763855, + "step": 1960 + }, + { + "epoch": 0.657104736490994, + "loss": 1.4525, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "grad_norm": 1.6834571361541748, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "learning_rate": 0.0007973305312359964, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 1.1696513891220093, + "step": 1970 + }, + { + "ce_loss": 0.4126241207122803, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.538031816482544, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.21698075532913208, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 2.0540080070495605, + "step": 1970 + }, + { + "ce_loss": 0.5341470837593079, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.6686353087425232, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.27718207240104675, + "step": 1970 + }, + { + "epoch": 0.6604402935290193, + "loss": 1.4011, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "grad_norm": 1.7924296855926514, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "learning_rate": 0.0007972804583426926, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 1.3130066394805908, + "step": 1980 + }, + { + "ce_loss": 0.38297760486602783, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.536170244216919, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.28933531045913696, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 1.7251466512680054, + "step": 1980 + }, + { + "ce_loss": 0.3758784532546997, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.5668622255325317, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.23983514308929443, + "step": 1980 + }, + { + "epoch": 0.6637758505670447, + "loss": 1.3723, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "grad_norm": 2.775648832321167, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "learning_rate": 0.0007972299217803446, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 1.8384811878204346, + "step": 1990 + }, + { + "ce_loss": 0.4294074773788452, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.5050294399261475, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.25213590264320374, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 1.1137229204177856, + "step": 1990 + }, + { + "ce_loss": 0.3956092596054077, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.4502190947532654, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.26725438237190247, + "step": 1990 + }, + { + "epoch": 0.66711140760507, + "loss": 1.3661, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "grad_norm": 1.7431342601776123, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "learning_rate": 0.0007971789216079343, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.351794719696045, + "step": 2000 + }, + { + "ce_loss": 0.38261255621910095, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 0.5665090084075928, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.2890423834323883, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.8742589950561523, + "step": 2000 + }, + { + "ce_loss": 0.4540679454803467, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 0.4872954487800598, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.27717703580856323, + "step": 2000 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.3747, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "grad_norm": 1.655342698097229, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "learning_rate": 0.0007971274578849843, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.5908936262130737, + "step": 2010 + }, + { + "ce_loss": 0.29928529262542725, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.5743173956871033, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.2413429170846939, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.7414389848709106, + "step": 2010 + }, + { + "ce_loss": 0.3730257451534271, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.5815277099609375, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.2591405510902405, + "step": 2010 + }, + { + "epoch": 0.6737825216811207, + "loss": 1.5379, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "grad_norm": 2.2649941444396973, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "learning_rate": 0.0007970755306715582, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 1.2894701957702637, + "step": 2020 + }, + { + "ce_loss": 0.4230029582977295, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.5699641108512878, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.29636257886886597, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 1.218937635421753, + "step": 2020 + }, + { + "ce_loss": 0.3319122791290283, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.4558425545692444, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.2575785219669342, + "step": 2020 + }, + { + "epoch": 0.6771180787191461, + "loss": 1.3827, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "grad_norm": 3.790936231613159, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "learning_rate": 0.0007970231400282608, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 1.5204871892929077, + "step": 2030 + }, + { + "ce_loss": 0.41601571440696716, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.5780538320541382, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.295123815536499, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 1.165385365486145, + "step": 2030 + }, + { + "ce_loss": 0.3634999394416809, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.5556265115737915, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.2461409717798233, + "step": 2030 + }, + { + "epoch": 0.6804536357571714, + "loss": 1.357, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "grad_norm": 1.3283863067626953, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "learning_rate": 0.0007969702860162373, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 1.5245931148529053, + "step": 2040 + }, + { + "ce_loss": 0.44471654295921326, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.5222198963165283, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.34963709115982056, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 1.142379641532898, + "step": 2040 + }, + { + "ce_loss": 0.33912596106529236, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.5050538778305054, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.22374626994132996, + "step": 2040 + }, + { + "epoch": 0.6837891927951968, + "loss": 1.4121, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "grad_norm": 1.3487452268600464, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "learning_rate": 0.0007969169686971745, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 2.050325870513916, + "step": 2050 + }, + { + "ce_loss": 0.38634929060935974, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.4903668463230133, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.23304055631160736, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 0.9491183757781982, + "step": 2050 + }, + { + "ce_loss": 0.31107857823371887, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.4289850890636444, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.2086729258298874, + "step": 2050 + }, + { + "epoch": 0.6871247498332221, + "loss": 1.5204, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "grad_norm": 2.084027051925659, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "learning_rate": 0.000796863188133299, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 1.198270320892334, + "step": 2060 + }, + { + "ce_loss": 0.3653385043144226, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.5144132375717163, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.3181459307670593, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 1.4320412874221802, + "step": 2060 + }, + { + "ce_loss": 0.40848636627197266, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.5593544840812683, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.2513773441314697, + "step": 2060 + }, + { + "epoch": 0.6904603068712475, + "loss": 1.3781, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "grad_norm": 1.3339126110076904, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "learning_rate": 0.0007968089443873788, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 1.4610631465911865, + "step": 2070 + }, + { + "ce_loss": 0.44683295488357544, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.5647901296615601, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.3352224826812744, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 1.8598355054855347, + "step": 2070 + }, + { + "ce_loss": 0.4054355323314667, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.5576649904251099, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.2876647412776947, + "step": 2070 + }, + { + "epoch": 0.6937958639092728, + "loss": 1.5379, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "grad_norm": 1.976668357849121, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "learning_rate": 0.000796754237522722, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 1.265521764755249, + "step": 2080 + }, + { + "ce_loss": 0.3824961483478546, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.543056070804596, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.2611978352069855, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 1.7373197078704834, + "step": 2080 + }, + { + "ce_loss": 0.4199475944042206, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.6220147609710693, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.2801419198513031, + "step": 2080 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.4568, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "grad_norm": 2.351896047592163, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "learning_rate": 0.0007966990676031776, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.1849398612976074, + "step": 2090 + }, + { + "ce_loss": 0.34545114636421204, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.5206227898597717, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.23222720623016357, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.408402919769287, + "step": 2090 + }, + { + "ce_loss": 0.38856178522109985, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.6217367053031921, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.29724544286727905, + "step": 2090 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.409, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "grad_norm": 1.8904672861099243, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "learning_rate": 0.0007966434346931348, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.664953351020813, + "step": 2100 + }, + { + "ce_loss": 0.33795663714408875, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.5694255828857422, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.24972786009311676, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.4305306673049927, + "step": 2100 + }, + { + "ce_loss": 0.37841373682022095, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.4339869022369385, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.32812896370887756, + "step": 2100 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.4373, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "grad_norm": 2.0401644706726074, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "learning_rate": 0.000796587338857523, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.8138028383255005, + "step": 2110 + }, + { + "ce_loss": 0.45140936970710754, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.6471231579780579, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.29433050751686096, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.4534084796905518, + "step": 2110 + }, + { + "ce_loss": 0.4121268689632416, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.5959084630012512, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.24375709891319275, + "step": 2110 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.4983, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "grad_norm": 2.243586301803589, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "learning_rate": 0.000796530780161812, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.4822847843170166, + "step": 2120 + }, + { + "ce_loss": 0.4244216978549957, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.5842169523239136, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.27704426646232605, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.61923086643219, + "step": 2120 + }, + { + "ce_loss": 0.4382035434246063, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.45522329211235046, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.2743866741657257, + "step": 2120 + }, + { + "epoch": 0.7104736490993996, + "loss": 1.3543, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "grad_norm": 2.200979709625244, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "learning_rate": 0.0007964737586720123, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 1.6453789472579956, + "step": 2130 + }, + { + "ce_loss": 0.3791709244251251, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.5444095134735107, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.2976440191268921, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 2.0682713985443115, + "step": 2130 + }, + { + "ce_loss": 0.3775831162929535, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.44049978256225586, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.29356908798217773, + "step": 2130 + }, + { + "epoch": 0.7138092061374249, + "loss": 1.3432, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "grad_norm": 2.2284011840820312, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "learning_rate": 0.0007964162744546739, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 1.4169831275939941, + "step": 2140 + }, + { + "ce_loss": 0.35462838411331177, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.4870684742927551, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.2719684839248657, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 1.5132737159729004, + "step": 2140 + }, + { + "ce_loss": 0.38168835639953613, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.5392582416534424, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.25710928440093994, + "step": 2140 + }, + { + "epoch": 0.7171447631754503, + "loss": 1.486, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "grad_norm": 1.8552848100662231, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "learning_rate": 0.000796358327576887, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 1.2554669380187988, + "step": 2150 + }, + { + "ce_loss": 0.3804562985897064, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.6099358201026917, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.26481449604034424, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 1.3438143730163574, + "step": 2150 + }, + { + "ce_loss": 0.378108948469162, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.5566970705986023, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.28770875930786133, + "step": 2150 + }, + { + "epoch": 0.7204803202134756, + "loss": 1.4144, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "grad_norm": 1.7240135669708252, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "learning_rate": 0.0007962999181062819, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 1.444587230682373, + "step": 2160 + }, + { + "ce_loss": 0.3592352867126465, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.5743659734725952, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.25933054089546204, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 1.0968458652496338, + "step": 2160 + }, + { + "ce_loss": 0.367275595664978, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.4670858383178711, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.26240402460098267, + "step": 2160 + }, + { + "epoch": 0.723815877251501, + "loss": 1.4534, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "grad_norm": 1.6614775657653809, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "learning_rate": 0.0007962410461110288, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 1.3979713916778564, + "step": 2170 + }, + { + "ce_loss": 0.4083508551120758, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.528069257736206, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.3267524540424347, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 1.4628068208694458, + "step": 2170 + }, + { + "ce_loss": 0.47597917914390564, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.597661018371582, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.30378589034080505, + "step": 2170 + }, + { + "epoch": 0.7271514342895263, + "loss": 1.4281, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "grad_norm": 1.3479666709899902, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "learning_rate": 0.0007961817116598375, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 1.3016619682312012, + "step": 2180 + }, + { + "ce_loss": 0.43028223514556885, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.5974459052085876, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.2737049162387848, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 1.5070172548294067, + "step": 2180 + }, + { + "ce_loss": 0.3665555715560913, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.5547187328338623, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.28890177607536316, + "step": 2180 + }, + { + "epoch": 0.7304869913275517, + "loss": 1.3728, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "grad_norm": 1.471031904220581, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "learning_rate": 0.0007961219148219578, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 1.1200194358825684, + "step": 2190 + }, + { + "ce_loss": 0.3308461904525757, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.47103047370910645, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.21239201724529266, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 1.172867774963379, + "step": 2190 + }, + { + "ce_loss": 0.31868767738342285, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.5440253615379333, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.2521239221096039, + "step": 2190 + }, + { + "epoch": 0.733822548365577, + "loss": 1.4048, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "grad_norm": 1.6411197185516357, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "learning_rate": 0.000796061655667179, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.8511903285980225, + "step": 2200 + }, + { + "ce_loss": 0.45695799589157104, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.5404932498931885, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.3444337844848633, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.359553575515747, + "step": 2200 + }, + { + "ce_loss": 0.4430965185165405, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.5114693641662598, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.3141864240169525, + "step": 2200 + }, + { + "epoch": 0.7371581054036024, + "loss": 1.3709, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "grad_norm": 3.038661479949951, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "learning_rate": 0.00079600093426583, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 1.3682180643081665, + "step": 2210 + }, + { + "ce_loss": 0.3720366060733795, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.5223158001899719, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.24465817213058472, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 1.923211932182312, + "step": 2210 + }, + { + "ce_loss": 0.4425932765007019, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.5633354783058167, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.24280905723571777, + "step": 2210 + }, + { + "epoch": 0.7404936624416277, + "loss": 1.3085, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "grad_norm": 2.342991590499878, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "learning_rate": 0.0007959397506887793, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 1.3994795083999634, + "step": 2220 + }, + { + "ce_loss": 0.3839331567287445, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.4684339463710785, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.21987880766391754, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 1.223686695098877, + "step": 2220 + }, + { + "ce_loss": 0.37719520926475525, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.528374195098877, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.22812281548976898, + "step": 2220 + }, + { + "epoch": 0.7438292194796531, + "loss": 1.3774, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "grad_norm": 1.3989163637161255, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "learning_rate": 0.0007958781050074347, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 1.081360101699829, + "step": 2230 + }, + { + "ce_loss": 0.26817405223846436, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.5221849083900452, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.18462881445884705, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 1.1048595905303955, + "step": 2230 + }, + { + "ce_loss": 0.33070671558380127, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.5014172196388245, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.19516457617282867, + "step": 2230 + }, + { + "epoch": 0.7471647765176784, + "loss": 1.4234, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "grad_norm": 1.628597378730774, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "learning_rate": 0.0007958159972937432, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 1.2197805643081665, + "step": 2240 + }, + { + "ce_loss": 0.3060518801212311, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.5412315130233765, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.25071093440055847, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 1.3633686304092407, + "step": 2240 + }, + { + "ce_loss": 0.41398224234580994, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.5853712558746338, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.28145208954811096, + "step": 2240 + }, + { + "epoch": 0.7505003335557038, + "loss": 1.4132, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "grad_norm": 1.8897957801818848, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "learning_rate": 0.0007957534276201915, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 1.2674027681350708, + "step": 2250 + }, + { + "ce_loss": 0.3586235046386719, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.5342280268669128, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.2958523631095886, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 1.8458093404769897, + "step": 2250 + }, + { + "ce_loss": 0.4460594058036804, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.552367627620697, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.29993292689323425, + "step": 2250 + }, + { + "epoch": 0.7538358905937291, + "loss": 1.4241, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "grad_norm": 1.2573357820510864, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "learning_rate": 0.0007956903960598048, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 1.100134015083313, + "step": 2260 + }, + { + "ce_loss": 0.40362513065338135, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.42274925112724304, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.27371761202812195, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 1.63364577293396, + "step": 2260 + }, + { + "ce_loss": 0.3939662575721741, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.46868401765823364, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.2829951047897339, + "step": 2260 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.3595, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "grad_norm": 2.485349178314209, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "learning_rate": 0.0007956269026861479, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.4890384674072266, + "step": 2270 + }, + { + "ce_loss": 0.4694492518901825, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.5618162155151367, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.3466222286224365, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.5350507497787476, + "step": 2270 + }, + { + "ce_loss": 0.36870887875556946, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.48704323172569275, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.2637166678905487, + "step": 2270 + }, + { + "epoch": 0.7605070046697798, + "loss": 1.3168, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "grad_norm": 1.4474360942840576, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "learning_rate": 0.0007955629475733243, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 1.5719066858291626, + "step": 2280 + }, + { + "ce_loss": 0.4709181487560272, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.6220474243164062, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.28237760066986084, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 1.221504807472229, + "step": 2280 + }, + { + "ce_loss": 0.38822704553604126, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.5881603956222534, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.24501189589500427, + "step": 2280 + }, + { + "epoch": 0.7638425617078052, + "loss": 1.415, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "grad_norm": 2.043067693710327, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "learning_rate": 0.0007954985307959766, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 1.6120023727416992, + "step": 2290 + }, + { + "ce_loss": 0.3817175030708313, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.5304407477378845, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.28059616684913635, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 1.2804994583129883, + "step": 2290 + }, + { + "ce_loss": 0.3931063711643219, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.5490153431892395, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.2660219967365265, + "step": 2290 + }, + { + "epoch": 0.7671781187458305, + "loss": 1.3366, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "grad_norm": 1.6212903261184692, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "learning_rate": 0.000795433652429286, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 1.1175814867019653, + "step": 2300 + }, + { + "ce_loss": 0.2823010981082916, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.5140184164047241, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.24143068492412567, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 1.392407774925232, + "step": 2300 + }, + { + "ce_loss": 0.42093566060066223, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.4724191427230835, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.2559778690338135, + "step": 2300 + }, + { + "epoch": 0.7705136757838559, + "loss": 1.4132, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "grad_norm": 1.636440634727478, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "learning_rate": 0.0007953683125489726, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 1.2124123573303223, + "step": 2310 + }, + { + "ce_loss": 0.36387935280799866, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.4210546016693115, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.2943533658981323, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 1.2485119104385376, + "step": 2310 + }, + { + "ce_loss": 0.36129215359687805, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.48826050758361816, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.23949351906776428, + "step": 2310 + }, + { + "epoch": 0.7738492328218812, + "loss": 1.3281, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "grad_norm": 1.528756856918335, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "learning_rate": 0.000795302511231295, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 1.0231574773788452, + "step": 2320 + }, + { + "ce_loss": 0.3148442208766937, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.5011311769485474, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.2043459415435791, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 1.7720094919204712, + "step": 2320 + }, + { + "ce_loss": 0.49245187640190125, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.6105131506919861, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.29217514395713806, + "step": 2320 + }, + { + "epoch": 0.7771847898599066, + "loss": 1.4187, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "grad_norm": 1.999721884727478, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "learning_rate": 0.0007952362485530506, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 1.6869237422943115, + "step": 2330 + }, + { + "ce_loss": 0.442462295293808, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.674324631690979, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.24086466431617737, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 1.5707772970199585, + "step": 2330 + }, + { + "ce_loss": 0.3937818109989166, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.6436639428138733, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.26808956265449524, + "step": 2330 + }, + { + "epoch": 0.7805203468979319, + "loss": 1.4501, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "grad_norm": 1.5357840061187744, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "learning_rate": 0.0007951695245915749, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 1.0470746755599976, + "step": 2340 + }, + { + "ce_loss": 0.33574843406677246, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.5088207721710205, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.2020200937986374, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 1.0406668186187744, + "step": 2340 + }, + { + "ce_loss": 0.29405948519706726, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.4793488681316376, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.2668147683143616, + "step": 2340 + }, + { + "epoch": 0.7838559039359573, + "loss": 1.3071, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "grad_norm": 2.38606595993042, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "learning_rate": 0.000795102339424742, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 1.5629955530166626, + "step": 2350 + }, + { + "ce_loss": 0.40422725677490234, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.5844305753707886, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.20816953480243683, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 1.3333702087402344, + "step": 2350 + }, + { + "ce_loss": 0.3669227957725525, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.5442429184913635, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.30379021167755127, + "step": 2350 + }, + { + "epoch": 0.7871914609739826, + "loss": 1.325, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "grad_norm": 2.1126739978790283, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "learning_rate": 0.0007950346931309643, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 1.3229087591171265, + "step": 2360 + }, + { + "ce_loss": 0.39628133177757263, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.48712071776390076, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.23661665618419647, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 1.3167979717254639, + "step": 2360 + }, + { + "ce_loss": 0.3563719093799591, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.4854099154472351, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.24429531395435333, + "step": 2360 + }, + { + "epoch": 0.790527018012008, + "loss": 1.2361, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "grad_norm": 1.4811742305755615, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "learning_rate": 0.0007949665857891921, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 1.2205973863601685, + "step": 2370 + }, + { + "ce_loss": 0.30885571241378784, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.501349925994873, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.21289817988872528, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 1.369713306427002, + "step": 2370 + }, + { + "ce_loss": 0.4380408227443695, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.6267238259315491, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.29107314348220825, + "step": 2370 + }, + { + "epoch": 0.7938625750500333, + "loss": 1.3653, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "grad_norm": 1.7153217792510986, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "learning_rate": 0.0007948980174789142, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 1.2276893854141235, + "step": 2380 + }, + { + "ce_loss": 0.37393128871917725, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.4779031574726105, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.2727866768836975, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 1.245436191558838, + "step": 2380 + }, + { + "ce_loss": 0.39289167523384094, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.4908082187175751, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.2664511501789093, + "step": 2380 + }, + { + "epoch": 0.7971981320880587, + "loss": 1.3731, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "grad_norm": 1.9598758220672607, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "learning_rate": 0.0007948289882801571, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 1.3128260374069214, + "step": 2390 + }, + { + "ce_loss": 0.3758243918418884, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.5747771263122559, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.26265060901641846, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 1.3741884231567383, + "step": 2390 + }, + { + "ce_loss": 0.37096890807151794, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.5457272529602051, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.2744729220867157, + "step": 2390 + }, + { + "epoch": 0.800533689126084, + "loss": 1.4166, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "grad_norm": 1.950492024421692, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "learning_rate": 0.0007947594982734852, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 1.686815857887268, + "step": 2400 + }, + { + "ce_loss": 0.3467777371406555, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.4724452495574951, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.20504708588123322, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 1.8396413326263428, + "step": 2400 + }, + { + "ce_loss": 0.3998579680919647, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.487520694732666, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.34791359305381775, + "step": 2400 + }, + { + "epoch": 0.8038692461641094, + "loss": 1.4165, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "grad_norm": 2.1652491092681885, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "learning_rate": 0.0007946895475400012, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 1.5087261199951172, + "step": 2410 + }, + { + "ce_loss": 0.4045005142688751, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.5614149570465088, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.30835992097854614, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 1.3226900100708008, + "step": 2410 + }, + { + "ce_loss": 0.4158706068992615, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.5947645902633667, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.2430378794670105, + "step": 2410 + }, + { + "epoch": 0.8072048032021347, + "loss": 1.3726, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "grad_norm": 1.8606551885604858, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "learning_rate": 0.0007946191361613447, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 1.2119914293289185, + "step": 2420 + }, + { + "ce_loss": 0.35136666893959045, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.48627468943595886, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.22440162301063538, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 1.457103967666626, + "step": 2420 + }, + { + "ce_loss": 0.3341229259967804, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.5586241483688354, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.24975526332855225, + "step": 2420 + }, + { + "epoch": 0.8105403602401601, + "loss": 1.354, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "grad_norm": 1.524516224861145, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "learning_rate": 0.0007945482642196935, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 1.5340917110443115, + "step": 2430 + }, + { + "ce_loss": 0.5005981922149658, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.6238539814949036, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.23065099120140076, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 1.250186562538147, + "step": 2430 + }, + { + "ce_loss": 0.38786232471466064, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.49738597869873047, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.28271862864494324, + "step": 2430 + }, + { + "epoch": 0.8138759172781854, + "loss": 1.4376, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "grad_norm": 1.48578941822052, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "learning_rate": 0.000794476931797763, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 1.2885890007019043, + "step": 2440 + }, + { + "ce_loss": 0.4292730987071991, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.4880996346473694, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.2967533469200134, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 1.2394975423812866, + "step": 2440 + }, + { + "ce_loss": 0.3743356764316559, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.5021654963493347, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.29697659611701965, + "step": 2440 + }, + { + "epoch": 0.8172114743162108, + "loss": 1.2983, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "grad_norm": 1.5816471576690674, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "learning_rate": 0.0007944051389788053, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 1.1448237895965576, + "step": 2450 + }, + { + "ce_loss": 0.36074283719062805, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.4741750657558441, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.2516009211540222, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 1.317401647567749, + "step": 2450 + }, + { + "ce_loss": 0.3394004702568054, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.466372013092041, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.28243008255958557, + "step": 2450 + }, + { + "epoch": 0.8205470313542361, + "loss": 1.3469, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "grad_norm": 2.4694275856018066, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "learning_rate": 0.0007943328858466108, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 1.3277108669281006, + "step": 2460 + }, + { + "ce_loss": 0.41051825881004333, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.6400450468063354, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.27634891867637634, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 1.2832062244415283, + "step": 2460 + }, + { + "ce_loss": 0.3461655378341675, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.5117206573486328, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.2564898729324341, + "step": 2460 + }, + { + "epoch": 0.8238825883922615, + "loss": 1.3354, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "grad_norm": 1.6225868463516235, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "learning_rate": 0.0007942601724855066, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 1.160365343093872, + "step": 2470 + }, + { + "ce_loss": 0.30365264415740967, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.5082361698150635, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.25682398676872253, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 1.7921531200408936, + "step": 2470 + }, + { + "ce_loss": 0.38309571146965027, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.4826603829860687, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.27224835753440857, + "step": 2470 + }, + { + "epoch": 0.8272181454302868, + "loss": 1.3585, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "grad_norm": 1.32754647731781, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "learning_rate": 0.000794186998980357, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 1.8056671619415283, + "step": 2480 + }, + { + "ce_loss": 0.3759201467037201, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.5555229187011719, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.2487923502922058, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 1.400477409362793, + "step": 2480 + }, + { + "ce_loss": 0.44156867265701294, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.5774922370910645, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.25374627113342285, + "step": 2480 + }, + { + "epoch": 0.8305537024683122, + "loss": 1.3699, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "grad_norm": 1.5362639427185059, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "learning_rate": 0.0007941133654165633, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 1.038097858428955, + "step": 2490 + }, + { + "ce_loss": 0.3375265300273895, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.48798954486846924, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.21024957299232483, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 1.239802598953247, + "step": 2490 + }, + { + "ce_loss": 0.3786744475364685, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.5701245069503784, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.20690640807151794, + "step": 2490 + }, + { + "epoch": 0.8338892595063375, + "loss": 1.364, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "grad_norm": 2.555614709854126, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "learning_rate": 0.0007940392718800637, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 1.0673748254776, + "step": 2500 + }, + { + "ce_loss": 0.29459384083747864, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.4663306474685669, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.2350129783153534, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 1.6143672466278076, + "step": 2500 + }, + { + "ce_loss": 0.5081417560577393, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.7202895879745483, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.31191402673721313, + "step": 2500 + }, + { + "epoch": 0.8372248165443629, + "loss": 1.3549, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "grad_norm": 1.6417618989944458, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "learning_rate": 0.0007939647184573334, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 1.2564659118652344, + "step": 2510 + }, + { + "ce_loss": 0.39230161905288696, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.5175186395645142, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.2814292013645172, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 1.0255411863327026, + "step": 2510 + }, + { + "ce_loss": 0.3256433308124542, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.40438857674598694, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.1968470960855484, + "step": 2510 + }, + { + "epoch": 0.8405603735823882, + "loss": 1.3068, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "grad_norm": 1.896988034248352, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "learning_rate": 0.0007938897052353845, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 1.3056790828704834, + "step": 2520 + }, + { + "ce_loss": 0.34220781922340393, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.5258098840713501, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.2810552418231964, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 1.2250864505767822, + "step": 2520 + }, + { + "ce_loss": 0.3773867189884186, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.4996189475059509, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.2604522705078125, + "step": 2520 + }, + { + "epoch": 0.8438959306204136, + "loss": 1.4173, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "grad_norm": 1.9606977701187134, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "learning_rate": 0.0007938142323017652, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 1.504172444343567, + "step": 2530 + }, + { + "ce_loss": 0.44866085052490234, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.5621837973594666, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.2699384391307831, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 1.3252410888671875, + "step": 2530 + }, + { + "ce_loss": 0.3632029592990875, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.5788899660110474, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.30363452434539795, + "step": 2530 + }, + { + "epoch": 0.8472314876584389, + "loss": 1.3297, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "grad_norm": 1.4642318487167358, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "learning_rate": 0.0007937382997445605, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 1.0956921577453613, + "step": 2540 + }, + { + "ce_loss": 0.3411797285079956, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.46473002433776855, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.19630055129528046, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 1.4409770965576172, + "step": 2540 + }, + { + "ce_loss": 0.43977662920951843, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.5356355905532837, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.2819962799549103, + "step": 2540 + }, + { + "epoch": 0.8505670446964643, + "loss": 1.3112, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "grad_norm": 1.6138858795166016, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "learning_rate": 0.0007936619076523922, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 1.133142113685608, + "step": 2550 + }, + { + "ce_loss": 0.3301082253456116, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.5005925297737122, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.23297664523124695, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 1.3417131900787354, + "step": 2550 + }, + { + "ce_loss": 0.4214775860309601, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.5064700841903687, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.2919166386127472, + "step": 2550 + }, + { + "epoch": 0.8539026017344896, + "loss": 1.4061, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "grad_norm": 3.5541536808013916, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "learning_rate": 0.0007935850561144179, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 1.4824650287628174, + "step": 2560 + }, + { + "ce_loss": 0.40878599882125854, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.5415136218070984, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.30184876918792725, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 1.5436408519744873, + "step": 2560 + }, + { + "ce_loss": 0.4445594847202301, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.5943202376365662, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.24817104637622833, + "step": 2560 + }, + { + "epoch": 0.857238158772515, + "loss": 1.3978, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "grad_norm": 2.0126984119415283, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "learning_rate": 0.0007935077452203315, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 1.287863850593567, + "step": 2570 + }, + { + "ce_loss": 0.3930123746395111, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.3892088532447815, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.2868179976940155, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 1.4019250869750977, + "step": 2570 + }, + { + "ce_loss": 0.45069876313209534, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.4385972023010254, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.30399036407470703, + "step": 2570 + }, + { + "epoch": 0.8605737158105403, + "loss": 1.3875, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "grad_norm": 1.6553137302398682, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "learning_rate": 0.0007934299750603633, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 1.6291496753692627, + "step": 2580 + }, + { + "ce_loss": 0.3738187551498413, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.5318026542663574, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.33766835927963257, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 1.650617003440857, + "step": 2580 + }, + { + "ce_loss": 0.42238470911979675, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.5802122354507446, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.22837448120117188, + "step": 2580 + }, + { + "epoch": 0.8639092728485657, + "loss": 1.3322, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "grad_norm": 1.695788860321045, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "learning_rate": 0.0007933517457252794, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 1.30512273311615, + "step": 2590 + }, + { + "ce_loss": 0.3906150758266449, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.5428471565246582, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.3009546995162964, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 1.2960855960845947, + "step": 2590 + }, + { + "ce_loss": 0.379740834236145, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.6086607575416565, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.22640559077262878, + "step": 2590 + }, + { + "epoch": 0.867244829886591, + "loss": 1.4382, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "grad_norm": 1.937280297279358, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "learning_rate": 0.0007932730573063818, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 1.3920255899429321, + "step": 2600 + }, + { + "ce_loss": 0.47237688302993774, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.523831307888031, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.3163221776485443, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 2.258751392364502, + "step": 2600 + }, + { + "ce_loss": 0.3232567608356476, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.4784823954105377, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.26763230562210083, + "step": 2600 + }, + { + "epoch": 0.8705803869246164, + "loss": 1.3785, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "grad_norm": 2.2580654621124268, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "learning_rate": 0.0007931939098955084, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 1.2365779876708984, + "step": 2610 + }, + { + "ce_loss": 0.29063519835472107, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.6072068214416504, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.23516863584518433, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.9008735418319702, + "step": 2610 + }, + { + "ce_loss": 0.2522759437561035, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.4741603434085846, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.17388072609901428, + "step": 2610 + }, + { + "epoch": 0.8739159439626417, + "loss": 1.2949, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "grad_norm": 2.209432363510132, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "learning_rate": 0.0007931143035850327, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 1.1630470752716064, + "step": 2620 + }, + { + "ce_loss": 0.35149869322776794, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.5646779537200928, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.2467038631439209, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 1.5878159999847412, + "step": 2620 + }, + { + "ce_loss": 0.3178049623966217, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.5251916646957397, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.23808318376541138, + "step": 2620 + }, + { + "epoch": 0.8772515010006671, + "loss": 1.2839, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "grad_norm": 1.3081079721450806, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "learning_rate": 0.0007930342384678639, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 1.6465835571289062, + "step": 2630 + }, + { + "ce_loss": 0.40648844838142395, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.5527106523513794, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.2469692975282669, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 1.1751917600631714, + "step": 2630 + }, + { + "ce_loss": 0.37903863191604614, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.44293490052223206, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.25282037258148193, + "step": 2630 + }, + { + "epoch": 0.8805870580386924, + "loss": 1.3603, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "grad_norm": 2.2184786796569824, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "learning_rate": 0.0007929537146374467, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 1.4768481254577637, + "step": 2640 + }, + { + "ce_loss": 0.33723726868629456, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.6114906668663025, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.2681977450847626, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 1.2050652503967285, + "step": 2640 + }, + { + "ce_loss": 0.31978118419647217, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.5009520649909973, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.1821107119321823, + "step": 2640 + }, + { + "epoch": 0.8839226150767178, + "loss": 1.2414, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "grad_norm": 1.6107494831085205, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "learning_rate": 0.0007928727321877607, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 1.1943227052688599, + "step": 2650 + }, + { + "ce_loss": 0.332078218460083, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.42531952261924744, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.2643662691116333, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.9776313900947571, + "step": 2650 + }, + { + "ce_loss": 0.31022366881370544, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.38916870951652527, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.20252655446529388, + "step": 2650 + }, + { + "epoch": 0.8872581721147431, + "loss": 1.2936, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "grad_norm": 1.5588855743408203, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "learning_rate": 0.0007927912912133215, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 1.7801876068115234, + "step": 2660 + }, + { + "ce_loss": 0.4108361303806305, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.5852051377296448, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.3112657368183136, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 1.6241830587387085, + "step": 2660 + }, + { + "ce_loss": 0.42643311619758606, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.6193053722381592, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.28631946444511414, + "step": 2660 + }, + { + "epoch": 0.8905937291527685, + "loss": 1.3849, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "grad_norm": 2.8556129932403564, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "learning_rate": 0.0007927093918091795, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 1.0800200700759888, + "step": 2670 + }, + { + "ce_loss": 0.3518918752670288, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.42704787850379944, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.3009520471096039, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 1.2904795408248901, + "step": 2670 + }, + { + "ce_loss": 0.37862733006477356, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.5287166237831116, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.27498745918273926, + "step": 2670 + }, + { + "epoch": 0.8939292861907938, + "loss": 1.2638, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "grad_norm": 1.6784123182296753, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "learning_rate": 0.0007926270340709198, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 1.2999699115753174, + "step": 2680 + }, + { + "ce_loss": 0.3899799585342407, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.5541209578514099, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.25357407331466675, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 1.3329912424087524, + "step": 2680 + }, + { + "ce_loss": 0.3716970980167389, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.5012491941452026, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.26823121309280396, + "step": 2680 + }, + { + "epoch": 0.8972648432288192, + "loss": 1.2492, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "grad_norm": 1.9010505676269531, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "learning_rate": 0.0007925442180946629, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 1.6424200534820557, + "step": 2690 + }, + { + "ce_loss": 0.3123776614665985, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.3880579471588135, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.26424267888069153, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 1.2713721990585327, + "step": 2690 + }, + { + "ce_loss": 0.38829106092453003, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.47953101992607117, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.30490636825561523, + "step": 2690 + }, + { + "epoch": 0.9006004002668445, + "loss": 1.2907, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "grad_norm": 1.915202021598816, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "learning_rate": 0.0007924609439770641, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 1.3119174242019653, + "step": 2700 + }, + { + "ce_loss": 0.37728697061538696, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.526457667350769, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.31909599900245667, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 1.0590205192565918, + "step": 2700 + }, + { + "ce_loss": 0.29085272550582886, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.47949159145355225, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.17374452948570251, + "step": 2700 + }, + { + "epoch": 0.9039359573048699, + "loss": 1.3608, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "grad_norm": 1.5788755416870117, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "learning_rate": 0.000792377211815313, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 1.1047440767288208, + "step": 2710 + }, + { + "ce_loss": 0.28150567412376404, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.4963770806789398, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.24585790932178497, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 1.1365015506744385, + "step": 2710 + }, + { + "ce_loss": 0.36603617668151855, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.4976668953895569, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.2073359489440918, + "step": 2710 + }, + { + "epoch": 0.9072715143428952, + "loss": 1.4386, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "grad_norm": 1.7400299310684204, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "learning_rate": 0.0007922930217071344, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 1.2005311250686646, + "step": 2720 + }, + { + "ce_loss": 0.3774307668209076, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.4605158567428589, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.29511210322380066, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 1.293221116065979, + "step": 2720 + }, + { + "ce_loss": 0.31144988536834717, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.47778937220573425, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.27237197756767273, + "step": 2720 + }, + { + "epoch": 0.9106070713809206, + "loss": 1.3218, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "grad_norm": 2.3409981727600098, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "learning_rate": 0.0007922083737507867, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 1.3133599758148193, + "step": 2730 + }, + { + "ce_loss": 0.34370356798171997, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.40750980377197266, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.2877673804759979, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 1.3804914951324463, + "step": 2730 + }, + { + "ce_loss": 0.4484287202358246, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.46959388256073, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.3645693361759186, + "step": 2730 + }, + { + "epoch": 0.9139426284189459, + "loss": 1.2124, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "grad_norm": 2.2181167602539062, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "learning_rate": 0.0007921232680450636, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 1.5099198818206787, + "step": 2740 + }, + { + "ce_loss": 0.4248862862586975, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.5102673172950745, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.28975045680999756, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 1.1082688570022583, + "step": 2740 + }, + { + "ce_loss": 0.3047284185886383, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.5240141153335571, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.2123073786497116, + "step": 2740 + }, + { + "epoch": 0.9172781854569713, + "loss": 1.395, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "grad_norm": 1.5949338674545288, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "learning_rate": 0.0007920377046892926, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 1.2958911657333374, + "step": 2750 + }, + { + "ce_loss": 0.3287883996963501, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.5168973207473755, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.25692057609558105, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 1.4671964645385742, + "step": 2750 + }, + { + "ce_loss": 0.3675439655780792, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.536207914352417, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.2701234221458435, + "step": 2750 + }, + { + "epoch": 0.9206137424949966, + "loss": 1.3855, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "grad_norm": 1.644079566001892, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "learning_rate": 0.0007919516837833351, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 1.8431727886199951, + "step": 2760 + }, + { + "ce_loss": 0.43441736698150635, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.5581046342849731, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.29165101051330566, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 1.910833716392517, + "step": 2760 + }, + { + "ce_loss": 0.44438034296035767, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.575432538986206, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.271775484085083, + "step": 2760 + }, + { + "epoch": 0.923949299533022, + "loss": 1.4388, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "grad_norm": 2.7886414527893066, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "learning_rate": 0.0007918652054275869, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 1.4392640590667725, + "step": 2770 + }, + { + "ce_loss": 0.31815871596336365, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.4943084418773651, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.24593886733055115, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 1.0102663040161133, + "step": 2770 + }, + { + "ce_loss": 0.2735036015510559, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.4617146849632263, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.19889551401138306, + "step": 2770 + }, + { + "epoch": 0.9272848565710473, + "loss": 1.4485, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "grad_norm": 1.6201666593551636, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "learning_rate": 0.0007917782697229776, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 1.2848777770996094, + "step": 2780 + }, + { + "ce_loss": 0.39188921451568604, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.5093104243278503, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.30376869440078735, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 1.471110463142395, + "step": 2780 + }, + { + "ce_loss": 0.4805634617805481, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.549042284488678, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.3469245135784149, + "step": 2780 + }, + { + "epoch": 0.9306204136090727, + "loss": 1.3917, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "grad_norm": 1.7547879219055176, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "learning_rate": 0.0007916908767709703, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 1.1947146654129028, + "step": 2790 + }, + { + "ce_loss": 0.35069945454597473, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.453375905752182, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.2738749086856842, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 1.2321999073028564, + "step": 2790 + }, + { + "ce_loss": 0.2855234742164612, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.40277066826820374, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.18820147216320038, + "step": 2790 + }, + { + "epoch": 0.933955970647098, + "loss": 1.2753, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "grad_norm": 1.6333000659942627, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "learning_rate": 0.0007916030266735622, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 1.1305317878723145, + "step": 2800 + }, + { + "ce_loss": 0.39312195777893066, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.4638335406780243, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.27346083521842957, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 1.0534822940826416, + "step": 2800 + }, + { + "ce_loss": 0.34346362948417664, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.4592100977897644, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.24397408962249756, + "step": 2800 + }, + { + "epoch": 0.9372915276851234, + "loss": 1.2893, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "grad_norm": 1.5109015703201294, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "learning_rate": 0.0007915147195332838, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 1.3544869422912598, + "step": 2810 + }, + { + "ce_loss": 0.4233008027076721, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.515604555606842, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.22143079340457916, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 1.2287521362304688, + "step": 2810 + }, + { + "ce_loss": 0.31915584206581116, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.4785001277923584, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.23532734811306, + "step": 2810 + }, + { + "epoch": 0.9406270847231488, + "loss": 1.5232, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "grad_norm": 2.7087512016296387, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "learning_rate": 0.0007914259554531989, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 1.053950548171997, + "step": 2820 + }, + { + "ce_loss": 0.31700703501701355, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.4933873116970062, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.22106169164180756, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 1.3688534498214722, + "step": 2820 + }, + { + "ce_loss": 0.39123156666755676, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.5485091209411621, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.27018389105796814, + "step": 2820 + }, + { + "epoch": 0.9439626417611742, + "loss": 1.4779, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "grad_norm": 2.3458189964294434, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "learning_rate": 0.0007913367345369048, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 2.1609432697296143, + "step": 2830 + }, + { + "ce_loss": 0.4953167736530304, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.6101577877998352, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.28571856021881104, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 1.1443687677383423, + "step": 2830 + }, + { + "ce_loss": 0.2994648516178131, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.515932023525238, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.2316143661737442, + "step": 2830 + }, + { + "epoch": 0.9472981987991995, + "loss": 1.3814, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "grad_norm": 2.0391764640808105, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "learning_rate": 0.000791247056888532, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 1.5001659393310547, + "step": 2840 + }, + { + "ce_loss": 0.3742694854736328, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.38781672716140747, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.21688145399093628, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 1.1347072124481201, + "step": 2840 + }, + { + "ce_loss": 0.3934358060359955, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.49731510877609253, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.24390080571174622, + "step": 2840 + }, + { + "epoch": 0.9506337558372249, + "loss": 1.3343, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "grad_norm": 1.5864659547805786, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "learning_rate": 0.0007911569226127438, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 1.3999756574630737, + "step": 2850 + }, + { + "ce_loss": 0.29279372096061707, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.49844199419021606, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.22119459509849548, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 1.1619170904159546, + "step": 2850 + }, + { + "ce_loss": 0.334495484828949, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.4583854675292969, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.28474098443984985, + "step": 2850 + }, + { + "epoch": 0.9539693128752502, + "loss": 1.4491, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "grad_norm": 1.9358841180801392, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "learning_rate": 0.0007910663318147368, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 1.1117554903030396, + "step": 2860 + }, + { + "ce_loss": 0.29617729783058167, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.4828736484050751, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.22240567207336426, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 1.3098286390304565, + "step": 2860 + }, + { + "ce_loss": 0.3587772250175476, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.5689359903335571, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.21025978028774261, + "step": 2860 + }, + { + "epoch": 0.9573048699132756, + "loss": 1.4297, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "grad_norm": 1.8924882411956787, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "learning_rate": 0.00079097528460024, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 1.4298171997070312, + "step": 2870 + }, + { + "ce_loss": 0.3467281460762024, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.6316140294075012, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.1883177012205124, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 1.6898854970932007, + "step": 2870 + }, + { + "ce_loss": 0.31687483191490173, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.5835000276565552, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.276297926902771, + "step": 2870 + }, + { + "epoch": 0.9606404269513009, + "loss": 1.3525, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "grad_norm": 1.3143359422683716, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "learning_rate": 0.0007908837810755154, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 1.9876177310943604, + "step": 2880 + }, + { + "ce_loss": 0.3657590448856354, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.48195603489875793, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.23338349163532257, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 1.2629928588867188, + "step": 2880 + }, + { + "ce_loss": 0.3671756088733673, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.5391901135444641, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.2850986123085022, + "step": 2880 + }, + { + "epoch": 0.9639759839893263, + "loss": 1.4018, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "grad_norm": 1.9271106719970703, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "learning_rate": 0.0007907918213473574, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 1.1845937967300415, + "step": 2890 + }, + { + "ce_loss": 0.4012899398803711, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.5150628089904785, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.26796332001686096, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 1.050054669380188, + "step": 2890 + }, + { + "ce_loss": 0.2966417372226715, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.43445032835006714, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.2567691504955292, + "step": 2890 + }, + { + "epoch": 0.9673115410273516, + "loss": 1.2895, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "grad_norm": 1.831406593322754, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "learning_rate": 0.000790699405523093, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 1.1192868947982788, + "step": 2900 + }, + { + "ce_loss": 0.3219367563724518, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.5257210731506348, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.25633111596107483, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 1.2179416418075562, + "step": 2900 + }, + { + "ce_loss": 0.35467156767845154, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.5465186238288879, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.31613627076148987, + "step": 2900 + }, + { + "epoch": 0.970647098065377, + "loss": 1.2964, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "grad_norm": 1.3753669261932373, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "learning_rate": 0.0007906065337105814, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 1.250396728515625, + "step": 2910 + }, + { + "ce_loss": 0.4102431833744049, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.5822953581809998, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.2576345205307007, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 1.4072184562683105, + "step": 2910 + }, + { + "ce_loss": 0.3921785056591034, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.513314962387085, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.2481769621372223, + "step": 2910 + }, + { + "epoch": 0.9739826551034023, + "loss": 1.3336, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "grad_norm": 1.6441563367843628, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "learning_rate": 0.0007905132060182138, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 1.6755125522613525, + "step": 2920 + }, + { + "ce_loss": 0.41334229707717896, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.6247754693031311, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.2773612141609192, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 1.2987523078918457, + "step": 2920 + }, + { + "ce_loss": 0.3157593309879303, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.48028379678726196, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.23197156190872192, + "step": 2920 + }, + { + "epoch": 0.9773182121414277, + "loss": 1.295, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "grad_norm": 2.0256261825561523, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "learning_rate": 0.000790419422554914, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 2.13342022895813, + "step": 2930 + }, + { + "ce_loss": 0.40803754329681396, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.6608292460441589, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.30405136942863464, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 1.7026296854019165, + "step": 2930 + }, + { + "ce_loss": 0.47600406408309937, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.6173453330993652, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.2594636380672455, + "step": 2930 + }, + { + "epoch": 0.980653769179453, + "loss": 1.3692, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "grad_norm": 2.0748965740203857, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "learning_rate": 0.0007903251834301372, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 0.9908037185668945, + "step": 2940 + }, + { + "ce_loss": 0.2691829204559326, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.4488098621368408, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.1903645396232605, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 1.34403657913208, + "step": 2940 + }, + { + "ce_loss": 0.3330442011356354, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.5426937937736511, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.24648922681808472, + "step": 2940 + }, + { + "epoch": 0.9839893262174784, + "loss": 1.2965, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "grad_norm": 1.500327229499817, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "learning_rate": 0.0007902304887538705, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 1.0015138387680054, + "step": 2950 + }, + { + "ce_loss": 0.27747029066085815, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.3931051194667816, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.18407243490219116, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 1.3468683958053589, + "step": 2950 + }, + { + "ce_loss": 0.3932223916053772, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.48848363757133484, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.3180612623691559, + "step": 2950 + }, + { + "epoch": 0.9873248832555037, + "loss": 1.3295, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "grad_norm": 2.592111110687256, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "learning_rate": 0.000790135338636633, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 1.3312675952911377, + "step": 2960 + }, + { + "ce_loss": 0.40466398000717163, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.5289773941040039, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.3111151158809662, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 1.2488740682601929, + "step": 2960 + }, + { + "ce_loss": 0.4030848741531372, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.5210260152816772, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.24048668146133423, + "step": 2960 + }, + { + "epoch": 0.9906604402935291, + "loss": 1.3397, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "grad_norm": 4.065495491027832, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "learning_rate": 0.0007900397331894749, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 1.8174149990081787, + "step": 2970 + }, + { + "ce_loss": 0.40033072233200073, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.48885297775268555, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.27875208854675293, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 1.0776383876800537, + "step": 2970 + }, + { + "ce_loss": 0.30982306599617004, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.4886612296104431, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.18483254313468933, + "step": 2970 + }, + { + "epoch": 0.9939959973315544, + "loss": 1.4159, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "grad_norm": 1.8326114416122437, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "learning_rate": 0.0007899436725239782, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 1.6776779890060425, + "step": 2980 + }, + { + "ce_loss": 0.4594779312610626, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.6358326077461243, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.3393205404281616, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 1.2762123346328735, + "step": 2980 + }, + { + "ce_loss": 0.3621397018432617, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.5921629071235657, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.2265629917383194, + "step": 2980 + }, + { + "epoch": 0.9973315543695798, + "loss": 1.3182, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "grad_norm": 2.937052011489868, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "learning_rate": 0.000789847156752256, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 1.1503270864486694, + "step": 2990 + }, + { + "ce_loss": 0.3201557993888855, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.4587818682193756, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.30107244849205017, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 1.6935726404190063, + "step": 2990 + }, + { + "ce_loss": 0.39489102363586426, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.5594819784164429, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.2576351463794708, + "step": 2990 + }, + { + "epoch": 1.0006671114076051, + "loss": 1.242, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "grad_norm": 1.2817131280899048, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "learning_rate": 0.0007897501859869525, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 1.1143462657928467, + "step": 3000 + }, + { + "ce_loss": 0.31217432022094727, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.5033187866210938, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.2228211611509323, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 1.2181870937347412, + "step": 3000 + }, + { + "ce_loss": 0.34828251600265503, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.5122925639152527, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.2240781933069229, + "step": 3000 + }, + { + "epoch": 1.0040026684456305, + "loss": 1.1703, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "grad_norm": 1.5048034191131592, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "learning_rate": 0.0007896527603412433, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 1.2737293243408203, + "step": 3010 + }, + { + "ce_loss": 0.332550048828125, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.49337321519851685, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.2538461983203888, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 1.1636898517608643, + "step": 3010 + }, + { + "ce_loss": 0.2893487513065338, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.46247366070747375, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.2124393880367279, + "step": 3010 + }, + { + "epoch": 1.0073382254836558, + "loss": 1.224, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "grad_norm": 1.7727106809616089, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "learning_rate": 0.0007895548799288343, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 1.5254015922546387, + "step": 3020 + }, + { + "ce_loss": 0.560138463973999, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.5498660802841187, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.4145788848400116, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 1.0851985216140747, + "step": 3020 + }, + { + "ce_loss": 0.3535903990268707, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.5172708034515381, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.21308091282844543, + "step": 3020 + }, + { + "epoch": 1.0106737825216812, + "loss": 1.3062, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "grad_norm": 2.0452983379364014, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "learning_rate": 0.0007894565448639626, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 1.3256267309188843, + "step": 3030 + }, + { + "ce_loss": 0.35627481341362, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.6277180314064026, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.2603911757469177, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 1.2563767433166504, + "step": 3030 + }, + { + "ce_loss": 0.31319910287857056, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.6499719619750977, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.22245129942893982, + "step": 3030 + }, + { + "epoch": 1.0140093395597065, + "loss": 1.3513, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "grad_norm": 3.9679436683654785, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "learning_rate": 0.0007893577552613957, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 1.243039846420288, + "step": 3040 + }, + { + "ce_loss": 0.37055256962776184, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.5363262295722961, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.27428340911865234, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 1.2832188606262207, + "step": 3040 + }, + { + "ce_loss": 0.3100008964538574, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.5250524878501892, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.19188229739665985, + "step": 3040 + }, + { + "epoch": 1.0173448965977319, + "loss": 1.3488, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "grad_norm": 1.9552648067474365, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "learning_rate": 0.0007892585112364318, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 1.4004244804382324, + "step": 3050 + }, + { + "ce_loss": 0.45617711544036865, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.6841182708740234, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.2595180571079254, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.9640132188796997, + "step": 3050 + }, + { + "ce_loss": 0.2764337360858917, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.42042332887649536, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.17508722841739655, + "step": 3050 + }, + { + "epoch": 1.0206804536357572, + "loss": 1.2818, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "grad_norm": 1.4938023090362549, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "learning_rate": 0.0007891588129048994, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 1.051990270614624, + "step": 3060 + }, + { + "ce_loss": 0.30879828333854675, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.4311293959617615, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.2406865358352661, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 1.816645622253418, + "step": 3060 + }, + { + "ce_loss": 0.3380657136440277, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.4694381058216095, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.24928055703639984, + "step": 3060 + }, + { + "epoch": 1.0240160106737826, + "loss": 1.2024, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "grad_norm": 1.9541714191436768, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "learning_rate": 0.000789058660383157, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 1.106635570526123, + "step": 3070 + }, + { + "ce_loss": 0.2738843262195587, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.43765518069267273, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.21291719377040863, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 1.5708824396133423, + "step": 3070 + }, + { + "ce_loss": 0.34916871786117554, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.4993576109409332, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.2292649745941162, + "step": 3070 + }, + { + "epoch": 1.027351567711808, + "loss": 1.279, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "grad_norm": 2.1405253410339355, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "learning_rate": 0.0007889580537880937, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 1.29014253616333, + "step": 3080 + }, + { + "ce_loss": 0.35143256187438965, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.5421102643013, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.23601114749908447, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 1.138009786605835, + "step": 3080 + }, + { + "ce_loss": 0.31548169255256653, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.5134023427963257, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.1867564469575882, + "step": 3080 + }, + { + "epoch": 1.0306871247498333, + "loss": 1.2572, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "grad_norm": 1.6170761585235596, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "learning_rate": 0.0007888569932371277, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 1.2278735637664795, + "step": 3090 + }, + { + "ce_loss": 0.3722643256187439, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.531057596206665, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.2565631568431854, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 1.568192958831787, + "step": 3090 + }, + { + "ce_loss": 0.3835557997226715, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.550069272518158, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.3273247182369232, + "step": 3090 + }, + { + "epoch": 1.0340226817878586, + "loss": 1.2316, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "grad_norm": 1.3664284944534302, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "learning_rate": 0.0007887554788482082, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 1.4323724508285522, + "step": 3100 + }, + { + "ce_loss": 0.38257670402526855, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.4578417241573334, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.2541353702545166, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 1.0234249830245972, + "step": 3100 + }, + { + "ce_loss": 0.303827166557312, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.4745440185070038, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.18835636973381042, + "step": 3100 + }, + { + "epoch": 1.037358238825884, + "loss": 1.2003, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "grad_norm": 1.3299576044082642, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "learning_rate": 0.0007886535107398128, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 1.4359586238861084, + "step": 3110 + }, + { + "ce_loss": 0.3999103903770447, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.5249671339988708, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.25556695461273193, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 1.2247552871704102, + "step": 3110 + }, + { + "ce_loss": 0.34925416111946106, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.47998103499412537, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.22177754342556, + "step": 3110 + }, + { + "epoch": 1.0406937958639093, + "loss": 1.3417, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "grad_norm": 1.6552432775497437, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "learning_rate": 0.0007885510890309498, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 1.7250030040740967, + "step": 3120 + }, + { + "ce_loss": 0.34814757108688354, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.6008387207984924, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.2179640829563141, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 1.2385932207107544, + "step": 3120 + }, + { + "ce_loss": 0.35398566722869873, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.4646395146846771, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.33056381344795227, + "step": 3120 + }, + { + "epoch": 1.0440293529019347, + "loss": 1.3849, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "grad_norm": 3.487548589706421, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "learning_rate": 0.0007884482138411558, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 1.1850218772888184, + "step": 3130 + }, + { + "ce_loss": 0.3026364743709564, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.4985314905643463, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.1886860877275467, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 1.2199435234069824, + "step": 3130 + }, + { + "ce_loss": 0.34935158491134644, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.5073997974395752, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.28104153275489807, + "step": 3130 + }, + { + "epoch": 1.04736490993996, + "loss": 1.2998, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "grad_norm": 1.450993299484253, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "learning_rate": 0.0007883448852904976, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 1.0731884241104126, + "step": 3140 + }, + { + "ce_loss": 0.2625713348388672, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.40948688983917236, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.20579855144023895, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 1.4011085033416748, + "step": 3140 + }, + { + "ce_loss": 0.3683658540248871, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.3941497206687927, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.2790229320526123, + "step": 3140 + }, + { + "epoch": 1.0507004669779854, + "loss": 1.2559, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "grad_norm": 1.5150136947631836, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "learning_rate": 0.0007882411034995705, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 1.803234338760376, + "step": 3150 + }, + { + "ce_loss": 0.3541768193244934, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.5089897513389587, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.2962924540042877, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 1.4103766679763794, + "step": 3150 + }, + { + "ce_loss": 0.4498169720172882, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.5849369168281555, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.2743041217327118, + "step": 3150 + }, + { + "epoch": 1.0540360240160107, + "loss": 1.3988, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "grad_norm": 1.3503497838974, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "learning_rate": 0.0007881368685894993, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 1.058760166168213, + "step": 3160 + }, + { + "ce_loss": 0.27203452587127686, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.49636560678482056, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.2184726893901825, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 1.3550480604171753, + "step": 3160 + }, + { + "ce_loss": 0.33902209997177124, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.540988028049469, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.2227366715669632, + "step": 3160 + }, + { + "epoch": 1.057371581054036, + "loss": 1.3205, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "grad_norm": 3.3052151203155518, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "learning_rate": 0.0007880321806819372, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 1.5313291549682617, + "step": 3170 + }, + { + "ce_loss": 0.2738790512084961, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.38085490465164185, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.24050869047641754, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 1.3720393180847168, + "step": 3170 + }, + { + "ce_loss": 0.3660949766635895, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.4232823848724365, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.22027719020843506, + "step": 3170 + }, + { + "epoch": 1.0607071380920614, + "loss": 1.2552, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "grad_norm": 1.9380136728286743, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "learning_rate": 0.0007879270398990663, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 1.4154558181762695, + "step": 3180 + }, + { + "ce_loss": 0.3620558977127075, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.5165147185325623, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.2523062825202942, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 1.2843544483184814, + "step": 3180 + }, + { + "ce_loss": 0.34891369938850403, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.46376562118530273, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.2205934226512909, + "step": 3180 + }, + { + "epoch": 1.0640426951300868, + "loss": 1.2705, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "grad_norm": 2.306701421737671, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "learning_rate": 0.000787821446363597, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 1.8948969841003418, + "step": 3190 + }, + { + "ce_loss": 0.3148987293243408, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.4278886318206787, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.21655955910682678, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 1.5901086330413818, + "step": 3190 + }, + { + "ce_loss": 0.44442218542099, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.5294621586799622, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.23455531895160675, + "step": 3190 + }, + { + "epoch": 1.067378252168112, + "loss": 1.2659, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "grad_norm": 1.9085355997085571, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "learning_rate": 0.0007877154001987686, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 1.9646542072296143, + "step": 3200 + }, + { + "ce_loss": 0.3970075845718384, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.5007684826850891, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.3607442378997803, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 1.1343801021575928, + "step": 3200 + }, + { + "ce_loss": 0.30973246693611145, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.5078864693641663, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.22439567744731903, + "step": 3200 + }, + { + "epoch": 1.0707138092061375, + "loss": 1.2294, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "grad_norm": 2.184018850326538, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "learning_rate": 0.0007876089015283481, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 1.1813832521438599, + "step": 3210 + }, + { + "ce_loss": 0.30030760169029236, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.4533817172050476, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.21787992119789124, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 1.1257771253585815, + "step": 3210 + }, + { + "ce_loss": 0.3650830388069153, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.43343934416770935, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.24671237170696259, + "step": 3210 + }, + { + "epoch": 1.0740493662441628, + "loss": 1.6693, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "grad_norm": 2.01936411857605, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "learning_rate": 0.0007875019504766312, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 1.2749767303466797, + "step": 3220 + }, + { + "ce_loss": 0.3731911778450012, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.4127843677997589, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.2641950845718384, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 1.7697055339813232, + "step": 3220 + }, + { + "ce_loss": 0.35934922099113464, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.48314180970191956, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.2707027196884155, + "step": 3220 + }, + { + "epoch": 1.0773849232821882, + "loss": 1.3645, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "grad_norm": 1.6033449172973633, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "learning_rate": 0.0007873945471684412, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 1.1107524633407593, + "step": 3230 + }, + { + "ce_loss": 0.3217277228832245, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.42127346992492676, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.26517701148986816, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 1.2702422142028809, + "step": 3230 + }, + { + "ce_loss": 0.3804514408111572, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.47932273149490356, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.21051423251628876, + "step": 3230 + }, + { + "epoch": 1.0807204803202135, + "loss": 1.1673, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "grad_norm": 1.7848643064498901, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "learning_rate": 0.0007872866917291293, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 1.1999231576919556, + "step": 3240 + }, + { + "ce_loss": 0.4094650447368622, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.47257867455482483, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.2415315955877304, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 1.1600497961044312, + "step": 3240 + }, + { + "ce_loss": 0.3341686427593231, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.3184273838996887, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.2142072468996048, + "step": 3240 + }, + { + "epoch": 1.0840560373582389, + "loss": 1.1987, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "grad_norm": 1.5663069486618042, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "learning_rate": 0.0007871783842845741, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 1.2865146398544312, + "step": 3250 + }, + { + "ce_loss": 0.3988375663757324, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.5756269693374634, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.3118850588798523, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 2.4615437984466553, + "step": 3250 + }, + { + "ce_loss": 0.48707595467567444, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.5528659820556641, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.35930198431015015, + "step": 3250 + }, + { + "epoch": 1.0873915943962642, + "loss": 1.2931, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "grad_norm": 1.8359395265579224, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "learning_rate": 0.0007870696249611827, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 1.341639518737793, + "step": 3260 + }, + { + "ce_loss": 0.3619093894958496, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.5633329153060913, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.22083061933517456, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 1.116979718208313, + "step": 3260 + }, + { + "ce_loss": 0.31899040937423706, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.544400155544281, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.25317904353141785, + "step": 3260 + }, + { + "epoch": 1.0907271514342896, + "loss": 1.2166, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "grad_norm": 1.9730409383773804, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "learning_rate": 0.0007869604138858883, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 1.2893974781036377, + "step": 3270 + }, + { + "ce_loss": 0.39021432399749756, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.6379407048225403, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.26053154468536377, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 1.4087176322937012, + "step": 3270 + }, + { + "ce_loss": 0.38117286562919617, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.5666294097900391, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.24387907981872559, + "step": 3270 + }, + { + "epoch": 1.094062708472315, + "loss": 1.2378, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "grad_norm": 1.3416624069213867, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "learning_rate": 0.0007868507511861523, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 1.1847916841506958, + "step": 3280 + }, + { + "ce_loss": 0.34078463912010193, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.5022830963134766, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.25572776794433594, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 1.2643036842346191, + "step": 3280 + }, + { + "ce_loss": 0.30773019790649414, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.49978795647621155, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.23188060522079468, + "step": 3280 + }, + { + "epoch": 1.0973982655103403, + "loss": 1.3057, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "grad_norm": 1.837242841720581, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "learning_rate": 0.0007867406369899628, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 1.1789145469665527, + "step": 3290 + }, + { + "ce_loss": 0.3441447615623474, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.5966684818267822, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.23769469559192657, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 1.3219236135482788, + "step": 3290 + }, + { + "ce_loss": 0.4166383743286133, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.5886305570602417, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.31640011072158813, + "step": 3290 + }, + { + "epoch": 1.1007338225483656, + "loss": 1.2599, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "grad_norm": 1.8349952697753906, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "learning_rate": 0.0007866300714258349, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 1.3195645809173584, + "step": 3300 + }, + { + "ce_loss": 0.3108004033565521, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.4452487528324127, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.24651941657066345, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 1.1526429653167725, + "step": 3300 + }, + { + "ce_loss": 0.3125385642051697, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.45304566621780396, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.20057500898838043, + "step": 3300 + }, + { + "epoch": 1.104069379586391, + "loss": 1.3455, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "grad_norm": 2.6072306632995605, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "learning_rate": 0.0007865190546228107, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 1.2560369968414307, + "step": 3310 + }, + { + "ce_loss": 0.37156903743743896, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.5403450131416321, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.26321572065353394, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 1.3976691961288452, + "step": 3310 + }, + { + "ce_loss": 0.40832340717315674, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.5309417247772217, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.28758132457733154, + "step": 3310 + }, + { + "epoch": 1.1074049366244163, + "loss": 1.4654, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "grad_norm": 1.5962499380111694, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "learning_rate": 0.0007864075867104584, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 1.1697372198104858, + "step": 3320 + }, + { + "ce_loss": 0.3503996431827545, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.5424225926399231, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.27611243724823, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 1.1543195247650146, + "step": 3320 + }, + { + "ce_loss": 0.3605566620826721, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.5575127601623535, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.23536986112594604, + "step": 3320 + }, + { + "epoch": 1.1107404936624417, + "loss": 1.1974, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "grad_norm": 1.4378269910812378, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "learning_rate": 0.0007862956678188732, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 2.0673465728759766, + "step": 3330 + }, + { + "ce_loss": 0.29359567165374756, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.44125255942344666, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.2021653950214386, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 1.4126696586608887, + "step": 3330 + }, + { + "ce_loss": 0.4507080018520355, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.5015628337860107, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.2881172001361847, + "step": 3330 + }, + { + "epoch": 1.114076050700467, + "loss": 1.3471, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "grad_norm": 1.4997214078903198, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "learning_rate": 0.0007861832980786765, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 1.090973973274231, + "step": 3340 + }, + { + "ce_loss": 0.3212396204471588, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.5364630222320557, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.23215319216251373, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 1.3080556392669678, + "step": 3340 + }, + { + "ce_loss": 0.37442025542259216, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.6098906397819519, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.2560647130012512, + "step": 3340 + }, + { + "epoch": 1.1174116077384924, + "loss": 1.2975, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "grad_norm": 1.920807957649231, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "learning_rate": 0.0007860704776210161, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 1.2911982536315918, + "step": 3350 + }, + { + "ce_loss": 0.3540288805961609, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.5088047981262207, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.2307043820619583, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 1.1055140495300293, + "step": 3350 + }, + { + "ce_loss": 0.3683701455593109, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.45363694429397583, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.20422856509685516, + "step": 3350 + }, + { + "epoch": 1.1207471647765177, + "loss": 1.3233, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "grad_norm": 1.5294970273971558, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "learning_rate": 0.0007859572065775654, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.9781809449195862, + "step": 3360 + }, + { + "ce_loss": 0.26756465435028076, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.468820184469223, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.2416294515132904, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 1.2498588562011719, + "step": 3360 + }, + { + "ce_loss": 0.3570075035095215, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.5418039560317993, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.19507354497909546, + "step": 3360 + }, + { + "epoch": 1.124082721814543, + "loss": 1.3759, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "grad_norm": 3.4274094104766846, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "learning_rate": 0.0007858434850805238, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 1.7688602209091187, + "step": 3370 + }, + { + "ce_loss": 0.37512704730033875, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.7253754734992981, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.2692413926124573, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 1.830051302909851, + "step": 3370 + }, + { + "ce_loss": 0.36339071393013, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.6195383667945862, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.23754560947418213, + "step": 3370 + }, + { + "epoch": 1.1274182788525684, + "loss": 1.4666, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "grad_norm": 2.594028949737549, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "learning_rate": 0.0007857293132626166, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 1.2381805181503296, + "step": 3380 + }, + { + "ce_loss": 0.3283141255378723, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.4507133662700653, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.2803668677806854, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.9831138253211975, + "step": 3380 + }, + { + "ce_loss": 0.35459038615226746, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.37437888979911804, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.25287047028541565, + "step": 3380 + }, + { + "epoch": 1.1307538358905938, + "loss": 1.1624, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "grad_norm": 1.9503041505813599, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "learning_rate": 0.0007856146912570947, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 1.2150638103485107, + "step": 3390 + }, + { + "ce_loss": 0.4028579592704773, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.5170663595199585, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.2950234115123749, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 1.2264457941055298, + "step": 3390 + }, + { + "ce_loss": 0.36099863052368164, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.5292174816131592, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.24654163420200348, + "step": 3390 + }, + { + "epoch": 1.134089392928619, + "loss": 1.2931, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "grad_norm": 1.8613334894180298, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "learning_rate": 0.0007854996191977343, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 1.1026091575622559, + "step": 3400 + }, + { + "ce_loss": 0.33049890398979187, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.44255223870277405, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.16969634592533112, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 1.1961816549301147, + "step": 3400 + }, + { + "ce_loss": 0.38204365968704224, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.5366348624229431, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.211135596036911, + "step": 3400 + }, + { + "epoch": 1.1374249499666444, + "loss": 1.2634, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "grad_norm": 1.833163857460022, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "learning_rate": 0.0007853840972188367, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 1.0167384147644043, + "step": 3410 + }, + { + "ce_loss": 0.2936549186706543, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.48618417978286743, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.23391129076480865, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 1.0690950155258179, + "step": 3410 + }, + { + "ce_loss": 0.28655749559402466, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.4551718831062317, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.22481927275657654, + "step": 3410 + }, + { + "epoch": 1.1407605070046698, + "loss": 1.2891, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "grad_norm": 1.7783701419830322, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "learning_rate": 0.0007852681254552286, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 1.2163112163543701, + "step": 3420 + }, + { + "ce_loss": 0.36884644627571106, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.4819245934486389, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.2516787648200989, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 1.3230206966400146, + "step": 3420 + }, + { + "ce_loss": 0.36667966842651367, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.47532573342323303, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.23686495423316956, + "step": 3420 + }, + { + "epoch": 1.1440960640426951, + "loss": 1.3354, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "grad_norm": 1.9137823581695557, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "learning_rate": 0.0007851517040422617, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 1.2071892023086548, + "step": 3430 + }, + { + "ce_loss": 0.35587406158447266, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.42205366492271423, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.2519875466823578, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 1.076033592224121, + "step": 3430 + }, + { + "ce_loss": 0.27833545207977295, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.49027466773986816, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.22420674562454224, + "step": 3430 + }, + { + "epoch": 1.1474316210807205, + "loss": 1.2992, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "grad_norm": 1.7274442911148071, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "learning_rate": 0.0007850348331158119, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 1.2800071239471436, + "step": 3440 + }, + { + "ce_loss": 0.4170500338077545, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.47338342666625977, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.30712056159973145, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 1.2456332445144653, + "step": 3440 + }, + { + "ce_loss": 0.3563118875026703, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.36735472083091736, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.25647401809692383, + "step": 3440 + }, + { + "epoch": 1.1507671781187458, + "loss": 1.2522, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "grad_norm": 1.9632582664489746, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "learning_rate": 0.0007849175128122806, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 1.0870206356048584, + "step": 3450 + }, + { + "ce_loss": 0.2861756384372711, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.5420786738395691, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.2082960158586502, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 1.270479679107666, + "step": 3450 + }, + { + "ce_loss": 0.4658251404762268, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.5474481582641602, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.25603190064430237, + "step": 3450 + }, + { + "epoch": 1.1541027351567712, + "loss": 1.3792, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "grad_norm": 1.5725535154342651, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "learning_rate": 0.0007847997432685929, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 1.2822141647338867, + "step": 3460 + }, + { + "ce_loss": 0.3111097812652588, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.4790066182613373, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.27783533930778503, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 2.144399404525757, + "step": 3460 + }, + { + "ce_loss": 0.35182690620422363, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.5352278351783752, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.24378477036952972, + "step": 3460 + }, + { + "epoch": 1.1574382921947965, + "loss": 1.2176, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "grad_norm": 2.390889883041382, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "learning_rate": 0.0007846815246221986, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 1.5885863304138184, + "step": 3470 + }, + { + "ce_loss": 0.47593382000923157, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.702208936214447, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.31476113200187683, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 1.1008955240249634, + "step": 3470 + }, + { + "ce_loss": 0.32944154739379883, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.4527980387210846, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.23255252838134766, + "step": 3470 + }, + { + "epoch": 1.160773849232822, + "loss": 1.2574, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "grad_norm": 2.0811338424682617, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "learning_rate": 0.0007845628570110716, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 1.130537748336792, + "step": 3480 + }, + { + "ce_loss": 0.3342140316963196, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.508356511592865, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.20144306123256683, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 1.4564712047576904, + "step": 3480 + }, + { + "ce_loss": 0.4794396460056305, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.5904377698898315, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.3139699697494507, + "step": 3480 + }, + { + "epoch": 1.1641094062708472, + "loss": 1.2398, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "grad_norm": 2.8266541957855225, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "learning_rate": 0.00078444374057371, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 2.1026384830474854, + "step": 3490 + }, + { + "ce_loss": 0.2914462983608246, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.45004281401634216, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.17569176852703094, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 1.6855539083480835, + "step": 3490 + }, + { + "ce_loss": 0.35392481088638306, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.5410393476486206, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.20788775384426117, + "step": 3490 + }, + { + "epoch": 1.1674449633088726, + "loss": 1.3848, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "grad_norm": 2.1698193550109863, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "learning_rate": 0.0007843241754491351, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 1.4109277725219727, + "step": 3500 + }, + { + "ce_loss": 0.4122583270072937, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.6634799242019653, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.26556360721588135, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 2.186739683151245, + "step": 3500 + }, + { + "ce_loss": 0.35684934258461, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.6196172833442688, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.27541086077690125, + "step": 3500 + }, + { + "epoch": 1.170780520346898, + "loss": 1.4175, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "grad_norm": 2.025646209716797, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "learning_rate": 0.0007842041617768926, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 1.4531854391098022, + "step": 3510 + }, + { + "ce_loss": 0.36734527349472046, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.6254667639732361, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.23278909921646118, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 1.8889970779418945, + "step": 3510 + }, + { + "ce_loss": 0.3474090099334717, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.5200473666191101, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.2619593143463135, + "step": 3510 + }, + { + "epoch": 1.1741160773849233, + "loss": 1.2872, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "grad_norm": 1.905614972114563, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "learning_rate": 0.0007840836996970511, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 1.0105056762695312, + "step": 3520 + }, + { + "ce_loss": 0.29185229539871216, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.4211788773536682, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.21574154496192932, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 1.386340856552124, + "step": 3520 + }, + { + "ce_loss": 0.3471980392932892, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.4527163803577423, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.23550935089588165, + "step": 3520 + }, + { + "epoch": 1.1774516344229486, + "loss": 1.2977, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "grad_norm": 2.2812416553497314, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "learning_rate": 0.0007839627893502031, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 1.3279774188995361, + "step": 3530 + }, + { + "ce_loss": 0.37937167286872864, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.5612952709197998, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.24463312327861786, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 1.4569265842437744, + "step": 3530 + }, + { + "ce_loss": 0.3426618278026581, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.5634005665779114, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.26913848519325256, + "step": 3530 + }, + { + "epoch": 1.180787191460974, + "loss": 1.2552, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "grad_norm": 1.6602412462234497, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "learning_rate": 0.0007838414308774637, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 1.6292023658752441, + "step": 3540 + }, + { + "ce_loss": 0.4299602806568146, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.5841928124427795, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.2711060345172882, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 1.1693357229232788, + "step": 3540 + }, + { + "ce_loss": 0.3660375475883484, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.5180224180221558, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.2825009226799011, + "step": 3540 + }, + { + "epoch": 1.1841227484989993, + "loss": 1.2843, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "grad_norm": 2.6830334663391113, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "learning_rate": 0.0007837196244204714, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 1.256629467010498, + "step": 3550 + }, + { + "ce_loss": 0.37410256266593933, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.47845524549484253, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.29637405276298523, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 1.1663440465927124, + "step": 3550 + }, + { + "ce_loss": 0.33838000893592834, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.45741572976112366, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.23389117419719696, + "step": 3550 + }, + { + "epoch": 1.1874583055370247, + "loss": 1.3448, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "grad_norm": 1.710472583770752, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "learning_rate": 0.0007835973701213874, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 1.1803218126296997, + "step": 3560 + }, + { + "ce_loss": 0.3162272274494171, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.4722076952457428, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.21254397928714752, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 1.2828024625778198, + "step": 3560 + }, + { + "ce_loss": 0.37789955735206604, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.5199463963508606, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.31490829586982727, + "step": 3560 + }, + { + "epoch": 1.19079386257505, + "loss": 1.3718, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "grad_norm": 1.7363239526748657, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "learning_rate": 0.0007834746681228959, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 1.1682206392288208, + "step": 3570 + }, + { + "ce_loss": 0.39901384711265564, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.410632848739624, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.2826874852180481, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 1.0605826377868652, + "step": 3570 + }, + { + "ce_loss": 0.3148486912250519, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.368727445602417, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.19040748476982117, + "step": 3570 + }, + { + "epoch": 1.1941294196130754, + "loss": 1.1801, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "grad_norm": 1.7157187461853027, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "learning_rate": 0.000783351518568203, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 1.5664011240005493, + "step": 3580 + }, + { + "ce_loss": 0.3691982328891754, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.5537074208259583, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.2542349398136139, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 1.4214699268341064, + "step": 3580 + }, + { + "ce_loss": 0.3670927882194519, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.536470890045166, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.259833961725235, + "step": 3580 + }, + { + "epoch": 1.1974649766511007, + "loss": 1.3455, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "grad_norm": 2.5788187980651855, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "learning_rate": 0.0007832279216010375, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 1.2196593284606934, + "step": 3590 + }, + { + "ce_loss": 0.4030509889125824, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.5046036243438721, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.3118016719818115, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 1.4102113246917725, + "step": 3590 + }, + { + "ce_loss": 0.39648792147636414, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.589056134223938, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.22151829302310944, + "step": 3590 + }, + { + "epoch": 1.200800533689126, + "loss": 1.2565, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "grad_norm": 2.462405204772949, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "learning_rate": 0.0007831038773656506, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 1.3264976739883423, + "step": 3600 + }, + { + "ce_loss": 0.36774370074272156, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.5057428479194641, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.22975876927375793, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 1.2413110733032227, + "step": 3600 + }, + { + "ce_loss": 0.3056010901927948, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.48162832856178284, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.23536323010921478, + "step": 3600 + }, + { + "epoch": 1.2041360907271514, + "loss": 1.3204, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "grad_norm": 1.8102093935012817, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "learning_rate": 0.0007829793860068151, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 1.1256744861602783, + "step": 3610 + }, + { + "ce_loss": 0.3574826419353485, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.514342725276947, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.2532525658607483, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 1.1662226915359497, + "step": 3610 + }, + { + "ce_loss": 0.3443499207496643, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.5596121549606323, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.2618454396724701, + "step": 3610 + }, + { + "epoch": 1.2074716477651768, + "loss": 1.1875, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "grad_norm": 1.8104478120803833, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "learning_rate": 0.0007828544476698258, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 1.4878114461898804, + "step": 3620 + }, + { + "ce_loss": 0.2988416254520416, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.4803342819213867, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.2804744243621826, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 1.1582154035568237, + "step": 3620 + }, + { + "ce_loss": 0.396658718585968, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.47372835874557495, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.2876175343990326, + "step": 3620 + }, + { + "epoch": 1.2108072048032021, + "loss": 1.1445, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "grad_norm": 1.6298176050186157, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "learning_rate": 0.0007827290625004993, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 1.3248764276504517, + "step": 3630 + }, + { + "ce_loss": 0.27193018794059753, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.43337440490722656, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.26712289452552795, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 1.4135502576828003, + "step": 3630 + }, + { + "ce_loss": 0.30455535650253296, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.47315406799316406, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.2176881581544876, + "step": 3630 + }, + { + "epoch": 1.2141427618412275, + "loss": 1.2222, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "grad_norm": 1.8237580060958862, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "learning_rate": 0.0007826032306451734, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 1.219295859336853, + "step": 3640 + }, + { + "ce_loss": 0.4064521789550781, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.5045239329338074, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.23610226809978485, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 1.324281096458435, + "step": 3640 + }, + { + "ce_loss": 0.39209476113319397, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.574234664440155, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.2725990414619446, + "step": 3640 + }, + { + "epoch": 1.2174783188792528, + "loss": 1.3203, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "grad_norm": 2.246127128601074, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "learning_rate": 0.0007824769522507076, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 1.3088812828063965, + "step": 3650 + }, + { + "ce_loss": 0.36943188309669495, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.5165199041366577, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.3023454248905182, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 1.0939652919769287, + "step": 3650 + }, + { + "ce_loss": 0.2995774447917938, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.5135769844055176, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.2256573736667633, + "step": 3650 + }, + { + "epoch": 1.2208138759172782, + "loss": 1.2701, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "grad_norm": 2.1431121826171875, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "learning_rate": 0.0007823502274644823, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 1.0833626985549927, + "step": 3660 + }, + { + "ce_loss": 0.3039146661758423, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.4804261028766632, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.2301558554172516, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 1.9844090938568115, + "step": 3660 + }, + { + "ce_loss": 0.31650522351264954, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.4741988182067871, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.19971761107444763, + "step": 3660 + }, + { + "epoch": 1.2241494329553035, + "loss": 1.3489, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "grad_norm": 1.5589141845703125, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "learning_rate": 0.000782223056434399, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 1.1556637287139893, + "step": 3670 + }, + { + "ce_loss": 0.3101668953895569, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.46484261751174927, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.1859927624464035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 1.2446397542953491, + "step": 3670 + }, + { + "ce_loss": 0.368746280670166, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.5332959294319153, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.2705717980861664, + "step": 3670 + }, + { + "epoch": 1.227484989993329, + "loss": 1.2077, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "grad_norm": 1.59275221824646, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "learning_rate": 0.0007820954393088799, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 1.3181159496307373, + "step": 3680 + }, + { + "ce_loss": 0.34320712089538574, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.405369371175766, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.2773212790489197, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 1.2116681337356567, + "step": 3680 + }, + { + "ce_loss": 0.39864087104797363, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.4519028663635254, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.28923535346984863, + "step": 3680 + }, + { + "epoch": 1.2308205470313542, + "loss": 1.2474, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "grad_norm": 2.0720341205596924, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "learning_rate": 0.0007819673762368679, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 1.9078865051269531, + "step": 3690 + }, + { + "ce_loss": 0.38197579979896545, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.47684934735298157, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.26847782731056213, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 0.9596579670906067, + "step": 3690 + }, + { + "ce_loss": 0.26398709416389465, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.417118638753891, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.1989685297012329, + "step": 3690 + }, + { + "epoch": 1.2341561040693796, + "loss": 1.269, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "grad_norm": 1.8928744792938232, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "learning_rate": 0.0007818388673678265, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 1.292384386062622, + "step": 3700 + }, + { + "ce_loss": 0.3532315492630005, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.4385555386543274, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.23133112490177155, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 1.2692075967788696, + "step": 3700 + }, + { + "ce_loss": 0.373373806476593, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.561535120010376, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.25908219814300537, + "step": 3700 + }, + { + "epoch": 1.237491661107405, + "loss": 1.2582, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "grad_norm": 1.5493221282958984, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "learning_rate": 0.0007817099128517393, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 1.013564944267273, + "step": 3710 + }, + { + "ce_loss": 0.3018394112586975, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.478537380695343, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.23245984315872192, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 1.2901955842971802, + "step": 3710 + }, + { + "ce_loss": 0.2493903636932373, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.44054585695266724, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.22715501487255096, + "step": 3710 + }, + { + "epoch": 1.2408272181454303, + "loss": 1.2502, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "grad_norm": 1.918712854385376, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "learning_rate": 0.0007815805128391102, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.9254280924797058, + "step": 3720 + }, + { + "ce_loss": 0.25510698556900024, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.4281381070613861, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.24195969104766846, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 1.0331034660339355, + "step": 3720 + }, + { + "ce_loss": 0.3291766941547394, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.4442521631717682, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.25957420468330383, + "step": 3720 + }, + { + "epoch": 1.2441627751834556, + "loss": 1.2901, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "grad_norm": 1.9499536752700806, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "learning_rate": 0.0007814506674809627, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 1.287750482559204, + "step": 3730 + }, + { + "ce_loss": 0.3604081869125366, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.4917229413986206, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.24488778412342072, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 1.1799731254577637, + "step": 3730 + }, + { + "ce_loss": 0.38218820095062256, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.44150495529174805, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.26581379771232605, + "step": 3730 + }, + { + "epoch": 1.247498332221481, + "loss": 1.2774, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "grad_norm": 1.8356317281723022, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "learning_rate": 0.0007813203769288405, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 1.0768628120422363, + "step": 3740 + }, + { + "ce_loss": 0.3683629035949707, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.448332279920578, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.25920337438583374, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 1.0234906673431396, + "step": 3740 + }, + { + "ce_loss": 0.29681557416915894, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.4987185597419739, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.22705894708633423, + "step": 3740 + }, + { + "epoch": 1.2508338892595063, + "loss": 1.2152, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "grad_norm": 3.1180875301361084, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "learning_rate": 0.0007811896413348068, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 1.5606341361999512, + "step": 3750 + }, + { + "ce_loss": 0.3866245448589325, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.48672977089881897, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.26517099142074585, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 1.093451738357544, + "step": 3750 + }, + { + "ce_loss": 0.38561955094337463, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.4325929284095764, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.2735663652420044, + "step": 3750 + }, + { + "epoch": 1.2541694462975317, + "loss": 1.2702, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "grad_norm": 1.780735731124878, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "learning_rate": 0.0007810584608514438, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 1.186219334602356, + "step": 3760 + }, + { + "ce_loss": 0.3113108277320862, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.4651794135570526, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.21116314828395844, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 1.305797815322876, + "step": 3760 + }, + { + "ce_loss": 0.40197962522506714, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.5337285995483398, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.2919960916042328, + "step": 3760 + }, + { + "epoch": 1.257505003335557, + "loss": 1.287, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "grad_norm": 1.7766234874725342, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "learning_rate": 0.0007809268356318535, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 1.434028148651123, + "step": 3770 + }, + { + "ce_loss": 0.3346835970878601, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.43117108941078186, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.2201913595199585, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 0.8691766858100891, + "step": 3770 + }, + { + "ce_loss": 0.2524417042732239, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.3669566512107849, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.24963033199310303, + "step": 3770 + }, + { + "epoch": 1.2608405603735824, + "loss": 1.1919, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "grad_norm": 2.417431116104126, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "learning_rate": 0.0007807947658296564, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 1.0297123193740845, + "step": 3780 + }, + { + "ce_loss": 0.31283092498779297, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.47859102487564087, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.2380753755569458, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 1.102510690689087, + "step": 3780 + }, + { + "ce_loss": 0.3200535774230957, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.5070088505744934, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.2310090810060501, + "step": 3780 + }, + { + "epoch": 1.2641761174116077, + "loss": 1.2351, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "grad_norm": 2.5228090286254883, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "learning_rate": 0.0007806622515989926, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 1.2249534130096436, + "step": 3790 + }, + { + "ce_loss": 0.3235895335674286, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.5146533846855164, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.30214163661003113, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 1.6786766052246094, + "step": 3790 + }, + { + "ce_loss": 0.3835620582103729, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.7029944658279419, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.2777293920516968, + "step": 3790 + }, + { + "epoch": 1.267511674449633, + "loss": 1.2841, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "grad_norm": 1.3752113580703735, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "learning_rate": 0.0007805292930945202, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 1.2774758338928223, + "step": 3800 + }, + { + "ce_loss": 0.34269601106643677, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.5056908130645752, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.22918358445167542, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 0.9431825280189514, + "step": 3800 + }, + { + "ce_loss": 0.29880571365356445, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.4647649824619293, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.17928951978683472, + "step": 3800 + }, + { + "epoch": 1.2708472314876584, + "loss": 1.332, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "grad_norm": 2.091035842895508, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "learning_rate": 0.0007803958904714159, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 1.3321853876113892, + "step": 3810 + }, + { + "ce_loss": 0.35707196593284607, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.5309280157089233, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.2696284353733063, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 1.3040084838867188, + "step": 3810 + }, + { + "ce_loss": 0.3999079465866089, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.46109065413475037, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.29647043347358704, + "step": 3810 + }, + { + "epoch": 1.2741827885256838, + "loss": 1.2456, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "grad_norm": 1.902409315109253, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "learning_rate": 0.0007802620438853754, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 1.62610924243927, + "step": 3820 + }, + { + "ce_loss": 0.40455368161201477, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.5282254815101624, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.2308456301689148, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 1.2230732440948486, + "step": 3820 + }, + { + "ce_loss": 0.34106773138046265, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.5421730279922485, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.23581770062446594, + "step": 3820 + }, + { + "epoch": 1.2775183455637091, + "loss": 1.3283, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "grad_norm": 1.6869583129882812, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "learning_rate": 0.0007801277534926117, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 1.173782467842102, + "step": 3830 + }, + { + "ce_loss": 0.3969452381134033, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.46130332350730896, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.2590175271034241, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 1.344570517539978, + "step": 3830 + }, + { + "ce_loss": 0.38278263807296753, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.5976495742797852, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.3041939437389374, + "step": 3830 + }, + { + "epoch": 1.2808539026017345, + "loss": 1.1972, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "grad_norm": 2.1180238723754883, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "learning_rate": 0.0007799930194498561, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 1.1536294221878052, + "step": 3840 + }, + { + "ce_loss": 0.3288697600364685, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.39680665731430054, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.32831668853759766, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 1.2123440504074097, + "step": 3840 + }, + { + "ce_loss": 0.3592081367969513, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.5434111952781677, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.21991020441055298, + "step": 3840 + }, + { + "epoch": 1.2841894596397598, + "loss": 1.2067, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "grad_norm": 1.927894949913025, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "learning_rate": 0.0007798578419143581, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 1.3616065979003906, + "step": 3850 + }, + { + "ce_loss": 0.4312002658843994, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.5158619284629822, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.33172714710235596, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 1.196885347366333, + "step": 3850 + }, + { + "ce_loss": 0.25604671239852905, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.47364771366119385, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.2535742521286011, + "step": 3850 + }, + { + "epoch": 1.2875250166777852, + "loss": 1.3678, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "grad_norm": 2.626359701156616, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "learning_rate": 0.000779722221043884, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 1.3988009691238403, + "step": 3860 + }, + { + "ce_loss": 0.32167813181877136, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.539366602897644, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.23677317798137665, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 1.2158448696136475, + "step": 3860 + }, + { + "ce_loss": 0.37974071502685547, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.5115886926651001, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.3241943418979645, + "step": 3860 + }, + { + "epoch": 1.2908605737158105, + "loss": 1.3151, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "grad_norm": 1.8815953731536865, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "learning_rate": 0.0007795861569967182, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 1.1867594718933105, + "step": 3870 + }, + { + "ce_loss": 0.3901292383670807, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.4555829167366028, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.29295268654823303, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 1.3478776216506958, + "step": 3870 + }, + { + "ce_loss": 0.35403814911842346, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.4828459322452545, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.29592955112457275, + "step": 3870 + }, + { + "epoch": 1.2941961307538359, + "loss": 1.2429, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "grad_norm": 1.8783077001571655, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "learning_rate": 0.0007794496499316621, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 0.9118102192878723, + "step": 3880 + }, + { + "ce_loss": 0.22861425578594208, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.39684969186782837, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.19896776974201202, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 1.749941110610962, + "step": 3880 + }, + { + "ce_loss": 0.4143109619617462, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.625407338142395, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.2498316466808319, + "step": 3880 + }, + { + "epoch": 1.2975316877918612, + "loss": 1.2324, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "grad_norm": 1.451900601387024, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "learning_rate": 0.000779312700008034, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 1.0057792663574219, + "step": 3890 + }, + { + "ce_loss": 0.33447906374931335, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.43524765968322754, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.23554539680480957, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 1.3061044216156006, + "step": 3890 + }, + { + "ce_loss": 0.35017919540405273, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.4204780161380768, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.2741265892982483, + "step": 3890 + }, + { + "epoch": 1.3008672448298866, + "loss": 1.2003, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "grad_norm": 2.9234039783477783, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "learning_rate": 0.0007791753073856692, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 1.3480119705200195, + "step": 3900 + }, + { + "ce_loss": 0.3394612669944763, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.6371109485626221, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.29357367753982544, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 0.9949564337730408, + "step": 3900 + }, + { + "ce_loss": 0.2775047719478607, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.5102842450141907, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.20661421120166779, + "step": 3900 + }, + { + "epoch": 1.304202801867912, + "loss": 1.2634, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "grad_norm": 1.8522040843963623, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "learning_rate": 0.0007790374722249198, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 1.3082270622253418, + "step": 3910 + }, + { + "ce_loss": 0.3670852780342102, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.5673364996910095, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.29259952902793884, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 0.9135763645172119, + "step": 3910 + }, + { + "ce_loss": 0.26060348749160767, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.45228105783462524, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.19999496638774872, + "step": 3910 + }, + { + "epoch": 1.3075383589059373, + "loss": 1.2096, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "grad_norm": 2.51267409324646, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "learning_rate": 0.0007788991946866542, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 1.2026423215866089, + "step": 3920 + }, + { + "ce_loss": 0.32737037539482117, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.5024385452270508, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.22580769658088684, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 1.123530387878418, + "step": 3920 + }, + { + "ce_loss": 0.37271732091903687, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.4684728682041168, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.2198222279548645, + "step": 3920 + }, + { + "epoch": 1.3108739159439626, + "loss": 1.1637, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "grad_norm": 1.9118099212646484, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "learning_rate": 0.000778760474932257, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 1.0190352201461792, + "step": 3930 + }, + { + "ce_loss": 0.31040164828300476, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.3947071433067322, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.247438445687294, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 1.3314695358276367, + "step": 3930 + }, + { + "ce_loss": 0.27209722995758057, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.4830207824707031, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.2210894525051117, + "step": 3930 + }, + { + "epoch": 1.314209472981988, + "loss": 1.2585, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "grad_norm": 1.4486713409423828, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "learning_rate": 0.0007786213131236294, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 0.9980438947677612, + "step": 3940 + }, + { + "ce_loss": 0.3219510614871979, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.47861647605895996, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.19660653173923492, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 1.198689341545105, + "step": 3940 + }, + { + "ce_loss": 0.26567214727401733, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.43696823716163635, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.20686624944210052, + "step": 3940 + }, + { + "epoch": 1.3175450300200133, + "loss": 1.2143, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "grad_norm": 1.840827226638794, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "learning_rate": 0.000778481709423188, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 1.210796594619751, + "step": 3950 + }, + { + "ce_loss": 0.35644543170928955, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.5447015762329102, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.2510436177253723, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 1.4380931854248047, + "step": 3950 + }, + { + "ce_loss": 0.2858380675315857, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.5480408072471619, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.24802200496196747, + "step": 3950 + }, + { + "epoch": 1.3208805870580387, + "loss": 1.3879, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "grad_norm": 1.9232362508773804, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "learning_rate": 0.0007783416639938654, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.9122708439826965, + "step": 3960 + }, + { + "ce_loss": 0.23479680716991425, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.4136393070220947, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.17437537014484406, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 1.3577370643615723, + "step": 3960 + }, + { + "ce_loss": 0.32100340723991394, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.4907395839691162, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.21169336140155792, + "step": 3960 + }, + { + "epoch": 1.324216144096064, + "loss": 1.2768, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "grad_norm": 1.9105011224746704, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "learning_rate": 0.0007782011769991097, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 0.9515153765678406, + "step": 3970 + }, + { + "ce_loss": 0.2723129391670227, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.4680030345916748, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.2104032188653946, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 1.3177235126495361, + "step": 3970 + }, + { + "ce_loss": 0.28969529271125793, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.4438014030456543, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.2746254503726959, + "step": 3970 + }, + { + "epoch": 1.3275517011340894, + "loss": 1.1935, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "grad_norm": 1.9408024549484253, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "learning_rate": 0.0007780602486028843, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 1.2587913274765015, + "step": 3980 + }, + { + "ce_loss": 0.3765130639076233, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.5410686135292053, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.24707219004631042, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 1.5994065999984741, + "step": 3980 + }, + { + "ce_loss": 0.4570700228214264, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.6066403388977051, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.2937600612640381, + "step": 3980 + }, + { + "epoch": 1.3308872581721147, + "loss": 1.2621, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "grad_norm": 1.874920129776001, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "learning_rate": 0.0007779188789696677, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 1.5024466514587402, + "step": 3990 + }, + { + "ce_loss": 0.4218442142009735, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.49272358417510986, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.30377674102783203, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 1.033069133758545, + "step": 3990 + }, + { + "ce_loss": 0.29926007986068726, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.4381902813911438, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.295410692691803, + "step": 3990 + }, + { + "epoch": 1.33422281521014, + "loss": 1.2749, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "grad_norm": 2.6915059089660645, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "learning_rate": 0.0007777770682644537, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 1.3868048191070557, + "step": 4000 + }, + { + "ce_loss": 0.3571189343929291, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.634859561920166, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.2819118797779083, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 1.1737803220748901, + "step": 4000 + }, + { + "ce_loss": 0.3412657082080841, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.5893285274505615, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.24275919795036316, + "step": 4000 + }, + { + "epoch": 1.3375583722481654, + "loss": 1.3882, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "grad_norm": 1.72599196434021, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "learning_rate": 0.0007776348166527506, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 1.196636438369751, + "step": 4010 + }, + { + "ce_loss": 0.31943756341934204, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.5612338781356812, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.19274453818798065, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 1.206289291381836, + "step": 4010 + }, + { + "ce_loss": 0.3338809907436371, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.5372907519340515, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.24270124733448029, + "step": 4010 + }, + { + "epoch": 1.3408939292861908, + "loss": 1.2553, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "grad_norm": 1.5856388807296753, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "learning_rate": 0.0007774921243005812, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 1.261049509048462, + "step": 4020 + }, + { + "ce_loss": 0.32735010981559753, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.406891405582428, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.26967743039131165, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 1.6279792785644531, + "step": 4020 + }, + { + "ce_loss": 0.33251306414604187, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.45015233755111694, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.2929288446903229, + "step": 4020 + }, + { + "epoch": 1.3442294863242161, + "loss": 1.3001, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "grad_norm": 1.8929225206375122, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "learning_rate": 0.0007773489913744829, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 1.1194690465927124, + "step": 4030 + }, + { + "ce_loss": 0.2566942572593689, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.45145970582962036, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.22091078758239746, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 1.2250628471374512, + "step": 4030 + }, + { + "ce_loss": 0.3045845329761505, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.5682451128959656, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.19009850919246674, + "step": 4030 + }, + { + "epoch": 1.3475650433622415, + "loss": 1.3624, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "grad_norm": 2.1252081394195557, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "learning_rate": 0.0007772054180415072, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.9570633769035339, + "step": 4040 + }, + { + "ce_loss": 0.27775803208351135, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.4150533676147461, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.18391333520412445, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 1.1152474880218506, + "step": 4040 + }, + { + "ce_loss": 0.3719904124736786, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.479840487241745, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.2096468210220337, + "step": 4040 + }, + { + "epoch": 1.3509006004002668, + "loss": 1.2063, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "grad_norm": 1.6501506567001343, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "learning_rate": 0.0007770614044692197, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.9688602089881897, + "step": 4050 + }, + { + "ce_loss": 0.2844759523868561, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.48957210779190063, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.1945677399635315, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.977230429649353, + "step": 4050 + }, + { + "ce_loss": 0.2928234934806824, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.47115814685821533, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.21297568082809448, + "step": 4050 + }, + { + "epoch": 1.3542361574382922, + "loss": 1.2671, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "grad_norm": 1.6580679416656494, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "learning_rate": 0.0007769169508256998, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 1.6507563591003418, + "step": 4060 + }, + { + "ce_loss": 0.3336380124092102, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.44040024280548096, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.2393060177564621, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 1.0483828783035278, + "step": 4060 + }, + { + "ce_loss": 0.3106835186481476, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.4723849296569824, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.19641432166099548, + "step": 4060 + }, + { + "epoch": 1.3575717144763175, + "loss": 1.2578, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "grad_norm": 1.6928682327270508, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "learning_rate": 0.0007767720572795402, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 1.1844429969787598, + "step": 4070 + }, + { + "ce_loss": 0.32700392603874207, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.5141096711158752, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.22966568171977997, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 1.1436493396759033, + "step": 4070 + }, + { + "ce_loss": 0.28880831599235535, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.4775233566761017, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.22410236299037933, + "step": 4070 + }, + { + "epoch": 1.3609072715143429, + "loss": 1.3958, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "grad_norm": 2.11891770362854, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "learning_rate": 0.0007766267239998474, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 1.4525783061981201, + "step": 4080 + }, + { + "ce_loss": 0.3727923631668091, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.5707982778549194, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.2720719873905182, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 1.2829095125198364, + "step": 4080 + }, + { + "ce_loss": 0.2625315487384796, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.5479893684387207, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.2146390825510025, + "step": 4080 + }, + { + "epoch": 1.3642428285523682, + "loss": 1.2783, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "grad_norm": 1.5971099138259888, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "learning_rate": 0.000776480951156241, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 1.1137681007385254, + "step": 4090 + }, + { + "ce_loss": 0.31989777088165283, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.46096158027648926, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.2357354760169983, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 1.3032519817352295, + "step": 4090 + }, + { + "ce_loss": 0.41542813181877136, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.5394437313079834, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.25867047905921936, + "step": 4090 + }, + { + "epoch": 1.3675783855903936, + "loss": 1.2893, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "grad_norm": 2.2129158973693848, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "learning_rate": 0.0007763347389188538, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 1.1820653676986694, + "step": 4100 + }, + { + "ce_loss": 0.3743279278278351, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.46733152866363525, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.27772408723831177, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 1.2491978406906128, + "step": 4100 + }, + { + "ce_loss": 0.3634282946586609, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.5043824911117554, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.222568079829216, + "step": 4100 + }, + { + "epoch": 1.370913942628419, + "loss": 1.2623, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "grad_norm": 2.6276814937591553, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "learning_rate": 0.0007761880874583308, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 0.9971789121627808, + "step": 4110 + }, + { + "ce_loss": 0.2905887961387634, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.4974335730075836, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.20612338185310364, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 1.457338809967041, + "step": 4110 + }, + { + "ce_loss": 0.3569025993347168, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.5805869102478027, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.25414037704467773, + "step": 4110 + }, + { + "epoch": 1.3742494996664443, + "loss": 1.1611, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "grad_norm": 1.3242756128311157, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "learning_rate": 0.0007760409969458301, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 1.0695313215255737, + "step": 4120 + }, + { + "ce_loss": 0.3189639747142792, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.40805482864379883, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.19657932221889496, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 0.9379848837852478, + "step": 4120 + }, + { + "ce_loss": 0.2650635242462158, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.4356876313686371, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.2354866862297058, + "step": 4120 + }, + { + "epoch": 1.3775850567044696, + "loss": 1.2268, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "grad_norm": 1.7438019514083862, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "learning_rate": 0.0007758934675530224, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 1.2131264209747314, + "step": 4130 + }, + { + "ce_loss": 0.36638134717941284, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.4722328186035156, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.3309879004955292, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 1.2295116186141968, + "step": 4130 + }, + { + "ce_loss": 0.34632059931755066, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.4670564532279968, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.26047638058662415, + "step": 4130 + }, + { + "epoch": 1.380920613742495, + "loss": 1.1731, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "grad_norm": 1.7917325496673584, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "learning_rate": 0.0007757454994520902, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 1.2207962274551392, + "step": 4140 + }, + { + "ce_loss": 0.3304611146450043, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.4229097366333008, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.2661569118499756, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 1.1358577013015747, + "step": 4140 + }, + { + "ce_loss": 0.32615402340888977, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.44644737243652344, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.2777961790561676, + "step": 4140 + }, + { + "epoch": 1.3842561707805203, + "loss": 1.187, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "grad_norm": 1.9539581537246704, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "learning_rate": 0.0007755970928157282, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 1.2930634021759033, + "step": 4150 + }, + { + "ce_loss": 0.3514963984489441, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.5322245955467224, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.1809874325990677, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.8671672940254211, + "step": 4150 + }, + { + "ce_loss": 0.27406421303749084, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.3994680643081665, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.19340065121650696, + "step": 4150 + }, + { + "epoch": 1.3875917278185457, + "loss": 1.1675, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "grad_norm": 1.6229897737503052, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "learning_rate": 0.0007754482478171432, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 1.1500556468963623, + "step": 4160 + }, + { + "ce_loss": 0.32584086060523987, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.36228689551353455, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.22661544382572174, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 1.1648491621017456, + "step": 4160 + }, + { + "ce_loss": 0.370568186044693, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.4062316417694092, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.284487783908844, + "step": 4160 + }, + { + "epoch": 1.390927284856571, + "loss": 1.1423, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "grad_norm": 2.137101888656616, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "learning_rate": 0.0007752989646300529, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 1.123513102531433, + "step": 4170 + }, + { + "ce_loss": 0.3663288950920105, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.46327751874923706, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.27190497517585754, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 1.4627983570098877, + "step": 4170 + }, + { + "ce_loss": 0.3249722421169281, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.4730352461338043, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.20529697835445404, + "step": 4170 + }, + { + "epoch": 1.3942628418945964, + "loss": 1.1578, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "grad_norm": 1.8561530113220215, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "learning_rate": 0.0007751492434286872, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 1.6327110528945923, + "step": 4180 + }, + { + "ce_loss": 0.23420818150043488, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.3671976625919342, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.20780381560325623, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 1.0552396774291992, + "step": 4180 + }, + { + "ce_loss": 0.2987080216407776, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.41605520248413086, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.1913209706544876, + "step": 4180 + }, + { + "epoch": 1.3975983989326217, + "loss": 1.1349, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "grad_norm": 1.696298599243164, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "learning_rate": 0.0007749990843877865, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 1.2016475200653076, + "step": 4190 + }, + { + "ce_loss": 0.3398931622505188, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.43274614214897156, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.2609109878540039, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 1.0881268978118896, + "step": 4190 + }, + { + "ce_loss": 0.3539738357067108, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.43918776512145996, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.1944909542798996, + "step": 4190 + }, + { + "epoch": 1.400933955970647, + "loss": 1.2285, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "grad_norm": 1.5924049615859985, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "learning_rate": 0.0007748484876826028, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 1.4822578430175781, + "step": 4200 + }, + { + "ce_loss": 0.28238165378570557, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.37509191036224365, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.21833594143390656, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 1.3037891387939453, + "step": 4200 + }, + { + "ce_loss": 0.3134443759918213, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.5113526582717896, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.2542562186717987, + "step": 4200 + }, + { + "epoch": 1.4042695130086724, + "loss": 1.2271, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "grad_norm": 1.8334202766418457, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "learning_rate": 0.0007746974534888986, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 1.1463003158569336, + "step": 4210 + }, + { + "ce_loss": 0.3059764504432678, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.47663551568984985, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.27451011538505554, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 1.2052125930786133, + "step": 4210 + }, + { + "ce_loss": 0.3422890305519104, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.5845454931259155, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.19704946875572205, + "step": 4210 + }, + { + "epoch": 1.4076050700466978, + "loss": 1.1701, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "grad_norm": 1.990908145904541, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "learning_rate": 0.0007745459819829473, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 1.103863000869751, + "step": 4220 + }, + { + "ce_loss": 0.27677813172340393, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.4710143208503723, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.17940568923950195, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 1.1673400402069092, + "step": 4220 + }, + { + "ce_loss": 0.3511430025100708, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.5159322023391724, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.24053901433944702, + "step": 4220 + }, + { + "epoch": 1.4109406270847231, + "loss": 1.2512, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "grad_norm": 1.604972004890442, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "learning_rate": 0.0007743940733415319, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 1.0855860710144043, + "step": 4230 + }, + { + "ce_loss": 0.3288971781730652, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.4643997848033905, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.20832104980945587, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 1.439693808555603, + "step": 4230 + }, + { + "ce_loss": 0.34634625911712646, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.5309983491897583, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.23617428541183472, + "step": 4230 + }, + { + "epoch": 1.4142761841227485, + "loss": 1.1937, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "grad_norm": 1.9160957336425781, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "learning_rate": 0.0007742417277419465, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 1.0655322074890137, + "step": 4240 + }, + { + "ce_loss": 0.2940945625305176, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.4044458568096161, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.20413602888584137, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 1.7881062030792236, + "step": 4240 + }, + { + "ce_loss": 0.46321895718574524, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.5222978591918945, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.2899708151817322, + "step": 4240 + }, + { + "epoch": 1.4176117411607738, + "loss": 1.2468, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "grad_norm": 2.470942735671997, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "learning_rate": 0.0007740889453619949, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 1.334357500076294, + "step": 4250 + }, + { + "ce_loss": 0.4395330250263214, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.5255869030952454, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.2873314321041107, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 1.2778472900390625, + "step": 4250 + }, + { + "ce_loss": 0.33936822414398193, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.44710713624954224, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.2442135512828827, + "step": 4250 + }, + { + "epoch": 1.4209472981987992, + "loss": 1.1803, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "grad_norm": 2.082793951034546, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "learning_rate": 0.0007739357263799903, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 1.5148963928222656, + "step": 4260 + }, + { + "ce_loss": 0.42286428809165955, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.4865938127040863, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.2644711434841156, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 1.1568526029586792, + "step": 4260 + }, + { + "ce_loss": 0.363336980342865, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.4715014100074768, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.25636565685272217, + "step": 4260 + }, + { + "epoch": 1.4242828552368245, + "loss": 1.2529, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "grad_norm": 1.6873589754104614, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "learning_rate": 0.0007737820709747559, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 1.147404432296753, + "step": 4270 + }, + { + "ce_loss": 0.3248310685157776, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.4509952664375305, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.24929717183113098, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 1.0453346967697144, + "step": 4270 + }, + { + "ce_loss": 0.3251522183418274, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.4964909553527832, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.2225407361984253, + "step": 4270 + }, + { + "epoch": 1.4276184122748499, + "loss": 1.2289, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "grad_norm": 2.059302806854248, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "learning_rate": 0.0007736279793256241, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 1.9120283126831055, + "step": 4280 + }, + { + "ce_loss": 0.37467390298843384, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.47755178809165955, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.29072806239128113, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 2.3525896072387695, + "step": 4280 + }, + { + "ce_loss": 0.3748549818992615, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.42405492067337036, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.2544288635253906, + "step": 4280 + }, + { + "epoch": 1.4309539693128752, + "loss": 1.3413, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "grad_norm": 1.8155747652053833, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "learning_rate": 0.0007734734516124362, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 1.2981414794921875, + "step": 4290 + }, + { + "ce_loss": 0.40608739852905273, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.5454487204551697, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.27907249331474304, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.9883788228034973, + "step": 4290 + }, + { + "ce_loss": 0.3114151656627655, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.42436087131500244, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.2510216236114502, + "step": 4290 + }, + { + "epoch": 1.4342895263509006, + "loss": 1.3547, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "grad_norm": 2.8617186546325684, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "learning_rate": 0.0007733184880155431, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 1.3844374418258667, + "step": 4300 + }, + { + "ce_loss": 0.41324886679649353, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.5392626523971558, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.25431501865386963, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 1.012142300605774, + "step": 4300 + }, + { + "ce_loss": 0.2379559874534607, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.425807386636734, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.2007993459701538, + "step": 4300 + }, + { + "epoch": 1.437625083388926, + "loss": 1.1946, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "grad_norm": 1.6949338912963867, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "learning_rate": 0.0007731630887158037, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 1.02177095413208, + "step": 4310 + }, + { + "ce_loss": 0.24608014523983002, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.45627400279045105, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.20076674222946167, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 1.084315299987793, + "step": 4310 + }, + { + "ce_loss": 0.3126720190048218, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.4719497561454773, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.22649230062961578, + "step": 4310 + }, + { + "epoch": 1.4409606404269513, + "loss": 1.2592, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "grad_norm": 2.882329225540161, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "learning_rate": 0.0007730072538945857, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 1.1591767072677612, + "step": 4320 + }, + { + "ce_loss": 0.36486318707466125, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.5568551421165466, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.23736506700515747, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 1.2775168418884277, + "step": 4320 + }, + { + "ce_loss": 0.42621615529060364, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.5392773151397705, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.311714231967926, + "step": 4320 + }, + { + "epoch": 1.4442961974649766, + "loss": 1.201, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "grad_norm": 3.2554428577423096, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "learning_rate": 0.0007728509837337652, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 2.2309839725494385, + "step": 4330 + }, + { + "ce_loss": 0.3380087614059448, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.5450072884559631, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.23968921601772308, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 1.2512460947036743, + "step": 4330 + }, + { + "ce_loss": 0.3639287054538727, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.5350576043128967, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.2919241189956665, + "step": 4330 + }, + { + "epoch": 1.447631754503002, + "loss": 1.2924, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "grad_norm": 2.581834077835083, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "learning_rate": 0.0007726942784157262, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 1.748133659362793, + "step": 4340 + }, + { + "ce_loss": 0.476225346326828, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.6821185350418091, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.2749059796333313, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 1.231296181678772, + "step": 4340 + }, + { + "ce_loss": 0.362838476896286, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.5025181770324707, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.28818202018737793, + "step": 4340 + }, + { + "epoch": 1.4509673115410273, + "loss": 1.2941, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "grad_norm": 1.7488747835159302, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "learning_rate": 0.0007725371381233607, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 1.2100708484649658, + "step": 4350 + }, + { + "ce_loss": 0.33391791582107544, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.46951207518577576, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.23552538454532623, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 1.1646292209625244, + "step": 4350 + }, + { + "ce_loss": 0.3197769820690155, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.4967893660068512, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.20482869446277618, + "step": 4350 + }, + { + "epoch": 1.4543028685790527, + "loss": 1.2345, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "grad_norm": 1.8481858968734741, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "learning_rate": 0.0007723795630400686, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 1.1097173690795898, + "step": 4360 + }, + { + "ce_loss": 0.27734866738319397, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.43501776456832886, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.20296035706996918, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 1.2077794075012207, + "step": 4360 + }, + { + "ce_loss": 0.3112337291240692, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.5319333076477051, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.21200990676879883, + "step": 4360 + }, + { + "epoch": 1.457638425617078, + "loss": 1.2801, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "grad_norm": 1.7149287462234497, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "learning_rate": 0.0007722215533497566, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 1.0904030799865723, + "step": 4370 + }, + { + "ce_loss": 0.2962927222251892, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.5173056125640869, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.21020416915416718, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 1.0985056161880493, + "step": 4370 + }, + { + "ce_loss": 0.27120473980903625, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.4675452709197998, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.22813116014003754, + "step": 4370 + }, + { + "epoch": 1.4609739826551034, + "loss": 1.256, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "grad_norm": 2.0012896060943604, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "learning_rate": 0.000772063109236839, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 1.459157943725586, + "step": 4380 + }, + { + "ce_loss": 0.23730525374412537, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.5278077125549316, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.1980505734682083, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 1.243227243423462, + "step": 4380 + }, + { + "ce_loss": 0.3660188913345337, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.6111223697662354, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.26495978236198425, + "step": 4380 + }, + { + "epoch": 1.4643095396931287, + "loss": 1.196, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "grad_norm": 1.796673059463501, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "learning_rate": 0.0007719042308862374, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 1.1504383087158203, + "step": 4390 + }, + { + "ce_loss": 0.28149980306625366, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.5336781144142151, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.22772520780563354, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 1.1730796098709106, + "step": 4390 + }, + { + "ce_loss": 0.2699909806251526, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.5290112495422363, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.25511685013771057, + "step": 4390 + }, + { + "epoch": 1.467645096731154, + "loss": 1.2889, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "grad_norm": 2.082970380783081, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "learning_rate": 0.0007717449184833797, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 1.255260705947876, + "step": 4400 + }, + { + "ce_loss": 0.36189717054367065, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.4005478322505951, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.25275516510009766, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 1.2640717029571533, + "step": 4400 + }, + { + "ce_loss": 0.36174458265304565, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.4172531068325043, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.2263191193342209, + "step": 4400 + }, + { + "epoch": 1.4709806537691794, + "loss": 1.2228, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "grad_norm": 1.754824161529541, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "learning_rate": 0.0007715851722142008, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 1.3902934789657593, + "step": 4410 + }, + { + "ce_loss": 0.37669166922569275, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.5949977040290833, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.3123665153980255, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 1.2151596546173096, + "step": 4410 + }, + { + "ce_loss": 0.32400956749916077, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.4897102415561676, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.20092438161373138, + "step": 4410 + }, + { + "epoch": 1.4743162108072048, + "loss": 1.2723, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "grad_norm": 2.524966239929199, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "learning_rate": 0.0007714249922651417, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 1.3874589204788208, + "step": 4420 + }, + { + "ce_loss": 0.30010318756103516, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.5788282752037048, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.230285182595253, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 1.1097252368927002, + "step": 4420 + }, + { + "ce_loss": 0.2605561316013336, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.46707484126091003, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.19243906438350677, + "step": 4420 + }, + { + "epoch": 1.4776517678452301, + "loss": 1.2361, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "grad_norm": 2.02272629737854, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "learning_rate": 0.0007712643788231496, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 1.1122024059295654, + "step": 4430 + }, + { + "ce_loss": 0.3622915744781494, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.46392694115638733, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.20297808945178986, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 1.1137298345565796, + "step": 4430 + }, + { + "ce_loss": 0.3511255085468292, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.4177210330963135, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.29275891184806824, + "step": 4430 + }, + { + "epoch": 1.4809873248832555, + "loss": 1.3339, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "grad_norm": 2.513075828552246, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "learning_rate": 0.0007711033320756778, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 1.3569178581237793, + "step": 4440 + }, + { + "ce_loss": 0.27129077911376953, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.44302868843078613, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.20074187219142914, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 1.2615227699279785, + "step": 4440 + }, + { + "ce_loss": 0.3332144021987915, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.49916476011276245, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.1940786987543106, + "step": 4440 + }, + { + "epoch": 1.4843228819212808, + "loss": 1.2713, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "grad_norm": 1.484968662261963, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "learning_rate": 0.0007709418522106851, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 1.2872264385223389, + "step": 4450 + }, + { + "ce_loss": 0.4188542664051056, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.5292216539382935, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.25378894805908203, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 1.5678834915161133, + "step": 4450 + }, + { + "ce_loss": 0.3771999478340149, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.6054632663726807, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.24410036206245422, + "step": 4450 + }, + { + "epoch": 1.4876584389593062, + "loss": 1.3639, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "grad_norm": 1.744352102279663, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "learning_rate": 0.0007707799394166358, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 1.6583737134933472, + "step": 4460 + }, + { + "ce_loss": 0.35331714153289795, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.4476446211338043, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.267397940158844, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 1.1916965246200562, + "step": 4460 + }, + { + "ce_loss": 0.3574371337890625, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.546086311340332, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.2879451811313629, + "step": 4460 + }, + { + "epoch": 1.4909939959973315, + "loss": 1.2757, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "grad_norm": 3.38885498046875, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "learning_rate": 0.0007706175938824996, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 1.2496098279953003, + "step": 4470 + }, + { + "ce_loss": 0.3497420847415924, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.5570492744445801, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.2517758309841156, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.9816231727600098, + "step": 4470 + }, + { + "ce_loss": 0.3141824007034302, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.46462568640708923, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.20259299874305725, + "step": 4470 + }, + { + "epoch": 1.4943295530353569, + "loss": 1.1332, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "grad_norm": 2.002901554107666, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "learning_rate": 0.0007704548157977514, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 1.007076382637024, + "step": 4480 + }, + { + "ce_loss": 0.29891398549079895, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.49870485067367554, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.20919978618621826, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 1.2025349140167236, + "step": 4480 + }, + { + "ce_loss": 0.3246881663799286, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.4924752712249756, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.2761264741420746, + "step": 4480 + }, + { + "epoch": 1.4976651100733822, + "loss": 1.2163, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "grad_norm": 1.9824186563491821, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "learning_rate": 0.0007702916053523705, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 1.797337532043457, + "step": 4490 + }, + { + "ce_loss": 0.40063029527664185, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.4892476201057434, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.2173793464899063, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 1.2050611972808838, + "step": 4490 + }, + { + "ce_loss": 0.3333210349082947, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.44667676091194153, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.22997619211673737, + "step": 4490 + }, + { + "epoch": 1.5010006671114076, + "loss": 1.295, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "grad_norm": 2.122441053390503, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "learning_rate": 0.0007701279627368411, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 1.2669577598571777, + "step": 4500 + }, + { + "ce_loss": 0.35459598898887634, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.5124510526657104, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.24606196582317352, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 1.0956400632858276, + "step": 4500 + }, + { + "ce_loss": 0.30307912826538086, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.5664011240005493, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.22324898838996887, + "step": 4500 + }, + { + "epoch": 1.504336224149433, + "loss": 1.2063, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "grad_norm": 2.289616346359253, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "learning_rate": 0.0007699638881421518, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 1.3379403352737427, + "step": 4510 + }, + { + "ce_loss": 0.3895055651664734, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.549110472202301, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.33225470781326294, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 0.9041282534599304, + "step": 4510 + }, + { + "ce_loss": 0.2577150762081146, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.4566647410392761, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.18841959536075592, + "step": 4510 + }, + { + "epoch": 1.5076717811874583, + "loss": 1.2233, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "grad_norm": 9.080262184143066, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "learning_rate": 0.0007697993817597952, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 1.5081042051315308, + "step": 4520 + }, + { + "ce_loss": 0.5217739939689636, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.6341196298599243, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.35214540362358093, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 2.01505446434021, + "step": 4520 + }, + { + "ce_loss": 0.4766079783439636, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.5420026779174805, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.27477625012397766, + "step": 4520 + }, + { + "epoch": 1.5110073382254836, + "loss": 1.3597, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "grad_norm": 3.787122964859009, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "learning_rate": 0.0007696344437817681, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 1.5112301111221313, + "step": 4530 + }, + { + "ce_loss": 0.3507489264011383, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.5297791361808777, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.23945032060146332, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 1.102178692817688, + "step": 4530 + }, + { + "ce_loss": 0.3480914235115051, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.5078696012496948, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.24594002962112427, + "step": 4530 + }, + { + "epoch": 1.514342895263509, + "loss": 1.2106, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "grad_norm": 1.8164446353912354, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "learning_rate": 0.0007694690744005707, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 1.1036242246627808, + "step": 4540 + }, + { + "ce_loss": 0.3030671179294586, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.5098708868026733, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.2081867754459381, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 1.0559051036834717, + "step": 4540 + }, + { + "ce_loss": 0.2896508276462555, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.4736049771308899, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.1540573388338089, + "step": 4540 + }, + { + "epoch": 1.5176784523015343, + "loss": 1.2321, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "grad_norm": 1.8194334506988525, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "learning_rate": 0.000769303273809207, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 1.0852607488632202, + "step": 4550 + }, + { + "ce_loss": 0.30452650785446167, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.44049713015556335, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.20696301758289337, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 1.345840334892273, + "step": 4550 + }, + { + "ce_loss": 0.22905874252319336, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.5314184427261353, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.19444139301776886, + "step": 4550 + }, + { + "epoch": 1.5210140093395597, + "loss": 1.2086, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "grad_norm": 2.125244140625, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "learning_rate": 0.0007691370422011842, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 1.3244200944900513, + "step": 4560 + }, + { + "ce_loss": 0.2872719168663025, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.42128679156303406, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.25759992003440857, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 1.0789515972137451, + "step": 4560 + }, + { + "ce_loss": 0.3220730125904083, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.4512641429901123, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.23262079060077667, + "step": 4560 + }, + { + "epoch": 1.524349566377585, + "loss": 1.1744, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "grad_norm": 1.373600721359253, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "learning_rate": 0.0007689703797705122, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 0.9655213952064514, + "step": 4570 + }, + { + "ce_loss": 0.2914479076862335, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.4219920039176941, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.186455637216568, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 1.125288963317871, + "step": 4570 + }, + { + "ce_loss": 0.3586792051792145, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.48464423418045044, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.20336459577083588, + "step": 4570 + }, + { + "epoch": 1.5276851234156104, + "loss": 1.2156, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "grad_norm": 1.8237141370773315, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "learning_rate": 0.0007688032867117043, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 1.5945961475372314, + "step": 4580 + }, + { + "ce_loss": 0.35956457257270813, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.49926865100860596, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.19571208953857422, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 1.1089470386505127, + "step": 4580 + }, + { + "ce_loss": 0.38314369320869446, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.4588963985443115, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.26634490489959717, + "step": 4580 + }, + { + "epoch": 1.5310206804536357, + "loss": 1.235, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "grad_norm": 1.6946427822113037, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "learning_rate": 0.0007686357632197758, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 1.077806830406189, + "step": 4590 + }, + { + "ce_loss": 0.31069520115852356, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.42448943853378296, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.28154149651527405, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.9597331285476685, + "step": 4590 + }, + { + "ce_loss": 0.3159964382648468, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.45832157135009766, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.18516969680786133, + "step": 4590 + }, + { + "epoch": 1.534356237491661, + "loss": 1.2692, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "grad_norm": 3.984046220779419, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "learning_rate": 0.0007684678094902449, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 1.0595605373382568, + "step": 4600 + }, + { + "ce_loss": 0.3087334632873535, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.46225982904434204, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.22030363976955414, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 0.9732711911201477, + "step": 4600 + }, + { + "ce_loss": 0.3353942334651947, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.3921261727809906, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.2052212506532669, + "step": 4600 + }, + { + "epoch": 1.5376917945296864, + "loss": 1.1144, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "grad_norm": 1.938663125038147, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "learning_rate": 0.0007682994257191315, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.9791405200958252, + "step": 4610 + }, + { + "ce_loss": 0.31135374307632446, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.39572346210479736, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.2084600180387497, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 1.0885542631149292, + "step": 4610 + }, + { + "ce_loss": 0.287092924118042, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.4916094243526459, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.2237331122159958, + "step": 4610 + }, + { + "epoch": 1.5410273515677118, + "loss": 1.113, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "grad_norm": 1.4871138334274292, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "learning_rate": 0.0007681306121029575, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 1.2581678628921509, + "step": 4620 + }, + { + "ce_loss": 0.4004390239715576, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.5559272766113281, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.25150373578071594, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 1.5844061374664307, + "step": 4620 + }, + { + "ce_loss": 0.34186816215515137, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.5579522848129272, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.20324687659740448, + "step": 4620 + }, + { + "epoch": 1.544362908605737, + "loss": 1.2223, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "grad_norm": 2.3152780532836914, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "learning_rate": 0.0007679613688387468, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 1.0994701385498047, + "step": 4630 + }, + { + "ce_loss": 0.289509117603302, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.45211362838745117, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.1917085349559784, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 0.9867453575134277, + "step": 4630 + }, + { + "ce_loss": 0.3251807689666748, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.46963727474212646, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.18415512144565582, + "step": 4630 + }, + { + "epoch": 1.5476984656437625, + "loss": 1.1549, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "grad_norm": 1.5950928926467896, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "learning_rate": 0.0007677916961240245, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.9006496667861938, + "step": 4640 + }, + { + "ce_loss": 0.29129329323768616, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.4002491235733032, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.20863546431064606, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 1.0415515899658203, + "step": 4640 + }, + { + "ce_loss": 0.2967177927494049, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.46538567543029785, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.20359046757221222, + "step": 4640 + }, + { + "epoch": 1.5510340226817878, + "loss": 1.1216, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "grad_norm": 1.950021505355835, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "learning_rate": 0.0007676215941568166, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.9957625865936279, + "step": 4650 + }, + { + "ce_loss": 0.2855859696865082, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.44514599442481995, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.18354029953479767, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 1.527466893196106, + "step": 4650 + }, + { + "ce_loss": 0.34161245822906494, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.4497413635253906, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.27534714341163635, + "step": 4650 + }, + { + "epoch": 1.5543695797198132, + "loss": 1.1575, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "grad_norm": 1.8270336389541626, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "learning_rate": 0.0007674510631356506, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.9707221388816833, + "step": 4660 + }, + { + "ce_loss": 0.2743615508079529, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.48221203684806824, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.15772660076618195, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 1.6755247116088867, + "step": 4660 + }, + { + "ce_loss": 0.4013010561466217, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.5076295137405396, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.2690885663032532, + "step": 4660 + }, + { + "epoch": 1.5577051367578385, + "loss": 1.3004, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "grad_norm": 1.6369752883911133, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "learning_rate": 0.0007672801032595547, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 1.2731767892837524, + "step": 4670 + }, + { + "ce_loss": 0.33239999413490295, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.514566957950592, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.22566305100917816, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 1.146661400794983, + "step": 4670 + }, + { + "ce_loss": 0.26267436146736145, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.4635869264602661, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.21978121995925903, + "step": 4670 + }, + { + "epoch": 1.5610406937958639, + "loss": 1.2349, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "grad_norm": 1.743242859840393, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "learning_rate": 0.0007671087147280572, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 1.3634655475616455, + "step": 4680 + }, + { + "ce_loss": 0.36991485953330994, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.5084767937660217, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.26672112941741943, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 1.1525386571884155, + "step": 4680 + }, + { + "ce_loss": 0.35368019342422485, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.4598918855190277, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.23088416457176208, + "step": 4680 + }, + { + "epoch": 1.5643762508338894, + "loss": 1.223, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "grad_norm": 1.8734859228134155, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "learning_rate": 0.0007669368977411871, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 1.6650550365447998, + "step": 4690 + }, + { + "ce_loss": 0.39589375257492065, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.5713174343109131, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.311811238527298, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 1.2223482131958008, + "step": 4690 + }, + { + "ce_loss": 0.389156311750412, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.4965554177761078, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.27140820026397705, + "step": 4690 + }, + { + "epoch": 1.5677118078719148, + "loss": 1.2665, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "grad_norm": 2.6322362422943115, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "learning_rate": 0.0007667646524994734, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 1.1102492809295654, + "step": 4700 + }, + { + "ce_loss": 0.31784892082214355, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.4729698598384857, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.25654831528663635, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.9936020374298096, + "step": 4700 + }, + { + "ce_loss": 0.30014559626579285, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.41301512718200684, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.21143920719623566, + "step": 4700 + }, + { + "epoch": 1.5710473649099401, + "loss": 1.2849, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "grad_norm": 2.960231304168701, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "learning_rate": 0.0007665919792039447, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 1.1995261907577515, + "step": 4710 + }, + { + "ce_loss": 0.29931750893592834, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.4858446717262268, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.20732131600379944, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 1.3857035636901855, + "step": 4710 + }, + { + "ce_loss": 0.35007715225219727, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.49830907583236694, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.2315230369567871, + "step": 4710 + }, + { + "epoch": 1.5743829219479655, + "loss": 1.1131, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "grad_norm": 2.1521153450012207, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "learning_rate": 0.0007664188780561292, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 1.097166657447815, + "step": 4720 + }, + { + "ce_loss": 0.3242321312427521, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.47403573989868164, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.23136167228221893, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 1.1047563552856445, + "step": 4720 + }, + { + "ce_loss": 0.3445529639720917, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.4420076012611389, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.2560592293739319, + "step": 4720 + }, + { + "epoch": 1.5777184789859908, + "loss": 1.2472, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "grad_norm": 1.5923607349395752, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "learning_rate": 0.0007662453492580548, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 1.2345513105392456, + "step": 4730 + }, + { + "ce_loss": 0.3786817789077759, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.42504045367240906, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.23074667155742645, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 1.123155951499939, + "step": 4730 + }, + { + "ce_loss": 0.37243664264678955, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.432323157787323, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.24366344511508942, + "step": 4730 + }, + { + "epoch": 1.5810540360240162, + "loss": 1.198, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "grad_norm": 2.146322727203369, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "learning_rate": 0.0007660713930122482, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 1.0306843519210815, + "step": 4740 + }, + { + "ce_loss": 0.2571704089641571, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.4897782802581787, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.18662314116954803, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 1.0882079601287842, + "step": 4740 + }, + { + "ce_loss": 0.29918238520622253, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.5254595875740051, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.17846040427684784, + "step": 4740 + }, + { + "epoch": 1.5843895930620415, + "loss": 1.1687, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "grad_norm": 1.7574557065963745, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "learning_rate": 0.0007658970095217349, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 1.7387831211090088, + "step": 4750 + }, + { + "ce_loss": 0.3886146545410156, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.5086261630058289, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.24735936522483826, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 1.527781367301941, + "step": 4750 + }, + { + "ce_loss": 0.35894984006881714, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.5893746614456177, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.2735004127025604, + "step": 4750 + }, + { + "epoch": 1.5877251501000669, + "loss": 1.3334, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "grad_norm": 1.800559401512146, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "learning_rate": 0.0007657221989900394, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 1.137058138847351, + "step": 4760 + }, + { + "ce_loss": 0.33247408270835876, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.4471021592617035, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.2743755877017975, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 1.2338804006576538, + "step": 4760 + }, + { + "ce_loss": 0.30527469515800476, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.43496543169021606, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.23000425100326538, + "step": 4760 + }, + { + "epoch": 1.5910607071380922, + "loss": 1.158, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "grad_norm": 1.7711436748504639, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "learning_rate": 0.0007655469616211845, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 1.3238584995269775, + "step": 4770 + }, + { + "ce_loss": 0.4081491529941559, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.5282933115959167, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.28045764565467834, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 1.1987744569778442, + "step": 4770 + }, + { + "ce_loss": 0.25979167222976685, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.4336746335029602, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.2212361842393875, + "step": 4770 + }, + { + "epoch": 1.5943962641761176, + "loss": 1.1415, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "grad_norm": 1.8768396377563477, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "learning_rate": 0.0007653712976196909, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 1.8086743354797363, + "step": 4780 + }, + { + "ce_loss": 0.27868327498435974, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.4092644155025482, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.17147424817085266, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 0.9440048336982727, + "step": 4780 + }, + { + "ce_loss": 0.29341205954551697, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.4141649305820465, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.23555238544940948, + "step": 4780 + }, + { + "epoch": 1.597731821214143, + "loss": 1.2361, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "grad_norm": 1.5691404342651367, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "learning_rate": 0.0007651952071905772, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 1.4893643856048584, + "step": 4790 + }, + { + "ce_loss": 0.2698959708213806, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.5141409039497375, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.190774604678154, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 1.882727026939392, + "step": 4790 + }, + { + "ce_loss": 0.283547043800354, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.4522022306919098, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.259072482585907, + "step": 4790 + }, + { + "epoch": 1.6010673782521683, + "loss": 1.2438, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "grad_norm": 1.6348069906234741, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "learning_rate": 0.0007650186905393602, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 1.0375322103500366, + "step": 4800 + }, + { + "ce_loss": 0.3192451298236847, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.4521045684814453, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.2652686834335327, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.933064877986908, + "step": 4800 + }, + { + "ce_loss": 0.29700523614883423, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.4245816171169281, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.21075178682804108, + "step": 4800 + }, + { + "epoch": 1.6044029352901936, + "loss": 1.1014, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "grad_norm": 2.032665252685547, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "learning_rate": 0.0007648417478720537, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 1.0235782861709595, + "step": 4810 + }, + { + "ce_loss": 0.284810334444046, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.3628327250480652, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.19773633778095245, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 1.0101426839828491, + "step": 4810 + }, + { + "ce_loss": 0.328506737947464, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.41719162464141846, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.23638367652893066, + "step": 4810 + }, + { + "epoch": 1.607738492328219, + "loss": 1.2109, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "grad_norm": 2.2429139614105225, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "learning_rate": 0.0007646643793951688, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 1.043318510055542, + "step": 4820 + }, + { + "ce_loss": 0.30978885293006897, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.47267794609069824, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.20902498066425323, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 1.0189939737319946, + "step": 4820 + }, + { + "ce_loss": 0.33055123686790466, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.4063712954521179, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.20739997923374176, + "step": 4820 + }, + { + "epoch": 1.6110740493662443, + "loss": 1.2053, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "grad_norm": 1.7878544330596924, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "learning_rate": 0.0007644865853157135, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 1.2280665636062622, + "step": 4830 + }, + { + "ce_loss": 0.29879072308540344, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.6149335503578186, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.22780798375606537, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 1.250707983970642, + "step": 4830 + }, + { + "ce_loss": 0.3582846522331238, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.5356194972991943, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.29270094633102417, + "step": 4830 + }, + { + "epoch": 1.6144096064042697, + "loss": 1.1639, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "grad_norm": 2.3316078186035156, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "learning_rate": 0.0007643083658411931, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 1.0784870386123657, + "step": 4840 + }, + { + "ce_loss": 0.24972647428512573, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.4576243460178375, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.18615812063217163, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 1.7174890041351318, + "step": 4840 + }, + { + "ce_loss": 0.3938414454460144, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.3543025553226471, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.2732624113559723, + "step": 4840 + }, + { + "epoch": 1.617745163442295, + "loss": 1.1165, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "grad_norm": 2.677441358566284, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "learning_rate": 0.0007641297211796083, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 1.179903507232666, + "step": 4850 + }, + { + "ce_loss": 0.26890891790390015, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.37936243414878845, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.18884652853012085, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 1.218037486076355, + "step": 4850 + }, + { + "ce_loss": 0.376087486743927, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.4474377930164337, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.23854652047157288, + "step": 4850 + }, + { + "epoch": 1.6210807204803204, + "loss": 1.1608, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "grad_norm": 1.8439033031463623, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "learning_rate": 0.000763950651539457, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 1.029344081878662, + "step": 4860 + }, + { + "ce_loss": 0.32084864377975464, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.4226093292236328, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.21213941276073456, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 1.2948123216629028, + "step": 4860 + }, + { + "ce_loss": 0.35812410712242126, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.443713903427124, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.2382977157831192, + "step": 4860 + }, + { + "epoch": 1.6244162775183457, + "loss": 1.1872, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "grad_norm": 1.9559777975082397, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "learning_rate": 0.0007637711571297326, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 1.845041036605835, + "step": 4870 + }, + { + "ce_loss": 0.3581797182559967, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.5245206356048584, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.2347164750099182, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 1.1421406269073486, + "step": 4870 + }, + { + "ce_loss": 0.3097943067550659, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.4998328685760498, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.24155227839946747, + "step": 4870 + }, + { + "epoch": 1.627751834556371, + "loss": 1.2329, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "grad_norm": 2.061876058578491, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "learning_rate": 0.0007635912381599244, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 1.2822391986846924, + "step": 4880 + }, + { + "ce_loss": 0.33181074261665344, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.557192325592041, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.2948383092880249, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 1.1364002227783203, + "step": 4880 + }, + { + "ce_loss": 0.3164156377315521, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.5235283374786377, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.2043505609035492, + "step": 4880 + }, + { + "epoch": 1.6310873915943964, + "loss": 1.2234, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "grad_norm": 1.7999014854431152, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "learning_rate": 0.0007634108948400174, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 1.3725650310516357, + "step": 4890 + }, + { + "ce_loss": 0.34826570749282837, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.44967207312583923, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.2176557332277298, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 1.3029837608337402, + "step": 4890 + }, + { + "ce_loss": 0.3555228114128113, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.5428327322006226, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.22554075717926025, + "step": 4890 + }, + { + "epoch": 1.6344229486324218, + "loss": 1.2092, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "grad_norm": 1.869430422782898, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "learning_rate": 0.0007632301273804913, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 1.1500366926193237, + "step": 4900 + }, + { + "ce_loss": 0.36326518654823303, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.5155556797981262, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.20150108635425568, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 1.1529258489608765, + "step": 4900 + }, + { + "ce_loss": 0.29777246713638306, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.4624185264110565, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.21998101472854614, + "step": 4900 + }, + { + "epoch": 1.6377585056704471, + "loss": 1.169, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "grad_norm": 1.7126103639602661, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "learning_rate": 0.0007630489359923214, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 1.02724289894104, + "step": 4910 + }, + { + "ce_loss": 0.31032949686050415, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.4123621881008148, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.2039308249950409, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 1.234750747680664, + "step": 4910 + }, + { + "ce_loss": 0.3182434141635895, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.4671993851661682, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.2012316882610321, + "step": 4910 + }, + { + "epoch": 1.6410940627084725, + "loss": 1.2431, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "grad_norm": 2.990509271621704, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "learning_rate": 0.0007628673208869777, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 1.2088675498962402, + "step": 4920 + }, + { + "ce_loss": 0.34323716163635254, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.514014720916748, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.2605898678302765, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 1.0380889177322388, + "step": 4920 + }, + { + "ce_loss": 0.3201424181461334, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.5008715987205505, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.21638834476470947, + "step": 4920 + }, + { + "epoch": 1.6444296197464978, + "loss": 1.231, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "grad_norm": 1.714496374130249, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "learning_rate": 0.0007626852822764242, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 1.2255754470825195, + "step": 4930 + }, + { + "ce_loss": 0.2979726195335388, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.463115394115448, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.24635154008865356, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 1.2807600498199463, + "step": 4930 + }, + { + "ce_loss": 0.35597312450408936, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.5844821929931641, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.23208926618099213, + "step": 4930 + }, + { + "epoch": 1.6477651767845232, + "loss": 1.2145, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "grad_norm": 3.78121018409729, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "learning_rate": 0.0007625028203731197, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 1.1447784900665283, + "step": 4940 + }, + { + "ce_loss": 0.3200160264968872, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.50001060962677, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.238821342587471, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.9587218165397644, + "step": 4940 + }, + { + "ce_loss": 0.2661581039428711, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.4619063436985016, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.2296360731124878, + "step": 4940 + }, + { + "epoch": 1.6511007338225485, + "loss": 1.1663, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "grad_norm": 1.8753292560577393, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "learning_rate": 0.000762319935390017, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 1.0324651002883911, + "step": 4950 + }, + { + "ce_loss": 0.2743676006793976, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.45579105615615845, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.24305647611618042, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 1.320138692855835, + "step": 4950 + }, + { + "ce_loss": 0.30213528871536255, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.5041840076446533, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.2646632790565491, + "step": 4950 + }, + { + "epoch": 1.6544362908605739, + "loss": 1.1035, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "grad_norm": 1.4800277948379517, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "learning_rate": 0.0007621366275405624, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 1.3182570934295654, + "step": 4960 + }, + { + "ce_loss": 0.32608532905578613, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.45269620418548584, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.33297863602638245, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 1.1951959133148193, + "step": 4960 + }, + { + "ce_loss": 0.3463919460773468, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.4839607775211334, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.2843335270881653, + "step": 4960 + }, + { + "epoch": 1.6577718478985992, + "loss": 1.1683, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "grad_norm": 1.8846877813339233, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "learning_rate": 0.000761952897038696, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.9753700494766235, + "step": 4970 + }, + { + "ce_loss": 0.3265887498855591, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.40698859095573425, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.24157138168811798, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 1.1718640327453613, + "step": 4970 + }, + { + "ce_loss": 0.30149784684181213, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.4266360402107239, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.1909862905740738, + "step": 4970 + }, + { + "epoch": 1.6611074049366246, + "loss": 1.0753, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "grad_norm": 1.8321624994277954, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "learning_rate": 0.000761768744098851, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 1.1958895921707153, + "step": 4980 + }, + { + "ce_loss": 0.3138892352581024, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.4766066074371338, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.19226688146591187, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 1.039772868156433, + "step": 4980 + }, + { + "ce_loss": 0.34119418263435364, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.46169552206993103, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.2307923138141632, + "step": 4980 + }, + { + "epoch": 1.66444296197465, + "loss": 1.1579, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "grad_norm": 1.930245041847229, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "learning_rate": 0.0007615841689359537, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 1.1153697967529297, + "step": 4990 + }, + { + "ce_loss": 0.33053624629974365, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.47470927238464355, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.23434945940971375, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 0.9403101801872253, + "step": 4990 + }, + { + "ce_loss": 0.22530633211135864, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.45416656136512756, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.16067540645599365, + "step": 4990 + }, + { + "epoch": 1.6677785190126753, + "loss": 1.1993, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "grad_norm": 1.5704511404037476, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "learning_rate": 0.0007613991717654232, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 1.2045124769210815, + "step": 5000 + }, + { + "ce_loss": 0.2704031467437744, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.4578080177307129, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.2710844874382019, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 1.0595126152038574, + "step": 5000 + }, + { + "ce_loss": 0.30270177125930786, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.4562907814979553, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.24846094846725464, + "step": 5000 + }, + { + "epoch": 1.6711140760507006, + "loss": 1.2138, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "grad_norm": 1.6759746074676514, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "learning_rate": 0.0007612137528031712, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 1.137813925743103, + "step": 5010 + }, + { + "ce_loss": 0.34316080808639526, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.5127457976341248, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.21678480505943298, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 1.0579105615615845, + "step": 5010 + }, + { + "ce_loss": 0.23692962527275085, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.4204230010509491, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.21300899982452393, + "step": 5010 + }, + { + "epoch": 1.674449633088726, + "loss": 1.16, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "grad_norm": 1.626281976699829, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "learning_rate": 0.0007610279122656013, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 1.0852789878845215, + "step": 5020 + }, + { + "ce_loss": 0.3337577283382416, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.408643901348114, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.2511367201805115, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 1.2534074783325195, + "step": 5020 + }, + { + "ce_loss": 0.30116984248161316, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.5019400119781494, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.1927444487810135, + "step": 5020 + }, + { + "epoch": 1.6777851901267513, + "loss": 1.223, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "grad_norm": 1.8005539178848267, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "learning_rate": 0.0007608416503696096, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 1.5476301908493042, + "step": 5030 + }, + { + "ce_loss": 0.4201894700527191, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.5770529508590698, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.26863935589790344, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 1.5807050466537476, + "step": 5030 + }, + { + "ce_loss": 0.3773467242717743, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.44635558128356934, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.31287744641304016, + "step": 5030 + }, + { + "epoch": 1.6811207471647767, + "loss": 1.2613, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "grad_norm": 1.4061245918273926, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "learning_rate": 0.0007606549673325838, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 1.2903411388397217, + "step": 5040 + }, + { + "ce_loss": 0.2738383710384369, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.4529511630535126, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.23904402554035187, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.8651126027107239, + "step": 5040 + }, + { + "ce_loss": 0.22598311305046082, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.38961902260780334, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.21298851072788239, + "step": 5040 + }, + { + "epoch": 1.684456304202802, + "loss": 1.1926, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "grad_norm": 2.3637235164642334, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "learning_rate": 0.000760467863372403, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 0.7955895662307739, + "step": 5050 + }, + { + "ce_loss": 0.2268964648246765, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.34705016016960144, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.2212955206632614, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 1.1423490047454834, + "step": 5050 + }, + { + "ce_loss": 0.30988848209381104, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.44475507736206055, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.30063945055007935, + "step": 5050 + }, + { + "epoch": 1.6877918612408274, + "loss": 1.1629, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "grad_norm": 1.8391002416610718, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "learning_rate": 0.0007602803387074378, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 1.0280489921569824, + "step": 5060 + }, + { + "ce_loss": 0.31085681915283203, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.48093947768211365, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.2359740436077118, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 1.0623470544815063, + "step": 5060 + }, + { + "ce_loss": 0.3287794589996338, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.4346166253089905, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.2352973371744156, + "step": 5060 + }, + { + "epoch": 1.6911274182788527, + "loss": 1.1754, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "grad_norm": 1.6396242380142212, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "learning_rate": 0.0007600923935565494, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 1.1601217985153198, + "step": 5070 + }, + { + "ce_loss": 0.308447927236557, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.5223665237426758, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.21539926528930664, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 1.4861093759536743, + "step": 5070 + }, + { + "ce_loss": 0.3695334196090698, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.5409172773361206, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.21675078570842743, + "step": 5070 + }, + { + "epoch": 1.694462975316878, + "loss": 1.2464, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "grad_norm": 1.831031084060669, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "learning_rate": 0.0007599040281390903, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 1.2833508253097534, + "step": 5080 + }, + { + "ce_loss": 0.25858020782470703, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.5383179187774658, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.25638070702552795, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 1.1427491903305054, + "step": 5080 + }, + { + "ce_loss": 0.35265713930130005, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.47813865542411804, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.24728532135486603, + "step": 5080 + }, + { + "epoch": 1.6977985323549034, + "loss": 1.2015, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "grad_norm": 2.529123067855835, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "learning_rate": 0.0007597152426749031, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 1.0560879707336426, + "step": 5090 + }, + { + "ce_loss": 0.2905693054199219, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.43988847732543945, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.2370666116476059, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 1.214661955833435, + "step": 5090 + }, + { + "ce_loss": 0.36676928400993347, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.5318382382392883, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.2559376657009125, + "step": 5090 + }, + { + "epoch": 1.7011340893929288, + "loss": 1.213, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "grad_norm": 1.7899972200393677, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "learning_rate": 0.0007595260373843205, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 1.218098759651184, + "step": 5100 + }, + { + "ce_loss": 0.32043206691741943, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.5054154992103577, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.22206301987171173, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 1.1606087684631348, + "step": 5100 + }, + { + "ce_loss": 0.31909579038619995, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.49823999404907227, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.2567177712917328, + "step": 5100 + }, + { + "epoch": 1.704469646430954, + "loss": 1.1741, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "grad_norm": 1.9370585680007935, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "learning_rate": 0.0007593364124881659, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 1.0405614376068115, + "step": 5110 + }, + { + "ce_loss": 0.3165607154369354, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.47016096115112305, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.2531571686267853, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 1.1778936386108398, + "step": 5110 + }, + { + "ce_loss": 0.32279154658317566, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.5456894040107727, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.20104186236858368, + "step": 5110 + }, + { + "epoch": 1.7078052034689795, + "loss": 1.2278, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "grad_norm": 2.5454940795898438, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "learning_rate": 0.0007591463682077518, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.8619414567947388, + "step": 5120 + }, + { + "ce_loss": 0.21449235081672668, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.39205607771873474, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.1672857403755188, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 1.0508708953857422, + "step": 5120 + }, + { + "ce_loss": 0.2673969864845276, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.49973276257514954, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.19507500529289246, + "step": 5120 + }, + { + "epoch": 1.7111407605070048, + "loss": 1.2024, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "grad_norm": 1.88835608959198, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "learning_rate": 0.0007589559047648801, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 1.2697545289993286, + "step": 5130 + }, + { + "ce_loss": 0.3176894783973694, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.5580767393112183, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.18888889253139496, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 1.4602257013320923, + "step": 5130 + }, + { + "ce_loss": 0.43063437938690186, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.4556039273738861, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.3002569377422333, + "step": 5130 + }, + { + "epoch": 1.7144763175450302, + "loss": 1.1297, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "grad_norm": 1.6927906274795532, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "learning_rate": 0.0007587650223818422, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 1.0018633604049683, + "step": 5140 + }, + { + "ce_loss": 0.2813844382762909, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.38585925102233887, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.17963579297065735, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.9852960705757141, + "step": 5140 + }, + { + "ce_loss": 0.251751184463501, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.37681418657302856, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.20930209755897522, + "step": 5140 + }, + { + "epoch": 1.7178118745830555, + "loss": 1.1583, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "grad_norm": 1.387619972229004, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "learning_rate": 0.0007585737212814186, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 1.509414553642273, + "step": 5150 + }, + { + "ce_loss": 0.37400636076927185, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.4108448028564453, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.24562181532382965, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 1.2409241199493408, + "step": 5150 + }, + { + "ce_loss": 0.322540283203125, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.46941179037094116, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.23541799187660217, + "step": 5150 + }, + { + "epoch": 1.7211474316210809, + "loss": 1.2913, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "grad_norm": 2.25449275970459, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "learning_rate": 0.0007583820016868781, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.9478521943092346, + "step": 5160 + }, + { + "ce_loss": 0.30188632011413574, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.4439525604248047, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.20185263454914093, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 1.008941650390625, + "step": 5160 + }, + { + "ce_loss": 0.2736072838306427, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.42032089829444885, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.14703622460365295, + "step": 5160 + }, + { + "epoch": 1.7244829886591062, + "loss": 1.2241, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "grad_norm": 1.9468799829483032, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "learning_rate": 0.0007581898638219782, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 1.4961808919906616, + "step": 5170 + }, + { + "ce_loss": 0.28084325790405273, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.4227789640426636, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.24790780246257782, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 1.213461995124817, + "step": 5170 + }, + { + "ce_loss": 0.30838051438331604, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.4545673727989197, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.20522762835025787, + "step": 5170 + }, + { + "epoch": 1.7278185456971316, + "loss": 1.1363, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "grad_norm": 1.6751428842544556, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "learning_rate": 0.0007579973079109644, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 1.1813774108886719, + "step": 5180 + }, + { + "ce_loss": 0.34286630153656006, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.5524317026138306, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.28594598174095154, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.9992787837982178, + "step": 5180 + }, + { + "ce_loss": 0.24412690103054047, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.4426809847354889, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.24817222356796265, + "step": 5180 + }, + { + "epoch": 1.731154102735157, + "loss": 1.1503, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "grad_norm": 1.4612666368484497, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "learning_rate": 0.0007578043341785701, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 1.0483276844024658, + "step": 5190 + }, + { + "ce_loss": 0.3545515835285187, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.42343053221702576, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.22489754855632782, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 0.7794271111488342, + "step": 5190 + }, + { + "ce_loss": 0.23218852281570435, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.34502241015434265, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.2018437385559082, + "step": 5190 + }, + { + "epoch": 1.7344896597731823, + "loss": 1.2091, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "grad_norm": 2.8298192024230957, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "learning_rate": 0.0007576109428500164, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 1.2747900485992432, + "step": 5200 + }, + { + "ce_loss": 0.2895369529724121, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.47683602571487427, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.27042490243911743, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 1.313528060913086, + "step": 5200 + }, + { + "ce_loss": 0.3060389757156372, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.47130000591278076, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.24032670259475708, + "step": 5200 + }, + { + "epoch": 1.7378252168112076, + "loss": 1.1724, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "grad_norm": 2.6083362102508545, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "learning_rate": 0.0007574171341510119, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 1.0935707092285156, + "step": 5210 + }, + { + "ce_loss": 0.22755658626556396, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.46089592576026917, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.1798480749130249, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 1.0753840208053589, + "step": 5210 + }, + { + "ce_loss": 0.31017637252807617, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.44865307211875916, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.20125728845596313, + "step": 5210 + }, + { + "epoch": 1.741160773849233, + "loss": 1.1641, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "grad_norm": 2.240241050720215, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "learning_rate": 0.0007572229083077524, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 1.3574274778366089, + "step": 5220 + }, + { + "ce_loss": 0.3337510824203491, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.5477548837661743, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.25798723101615906, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 1.0125255584716797, + "step": 5220 + }, + { + "ce_loss": 0.32500120997428894, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.43379876017570496, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.25346094369888306, + "step": 5220 + }, + { + "epoch": 1.7444963308872583, + "loss": 1.2409, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "grad_norm": 2.328303575515747, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "learning_rate": 0.0007570282655469198, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 1.5247856378555298, + "step": 5230 + }, + { + "ce_loss": 0.32193514704704285, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.4718524217605591, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.25647538900375366, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 1.5344735383987427, + "step": 5230 + }, + { + "ce_loss": 0.37541303038597107, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.47761592268943787, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.19708700478076935, + "step": 5230 + }, + { + "epoch": 1.7478318879252837, + "loss": 1.2756, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "grad_norm": 1.7134599685668945, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "learning_rate": 0.0007568332060956836, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 1.1749379634857178, + "step": 5240 + }, + { + "ce_loss": 0.31640517711639404, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.45369449257850647, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.22938039898872375, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 1.536935806274414, + "step": 5240 + }, + { + "ce_loss": 0.3823091983795166, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.533094048500061, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.25532063841819763, + "step": 5240 + }, + { + "epoch": 1.751167444963309, + "loss": 1.2075, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "grad_norm": 1.5515145063400269, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "learning_rate": 0.0007566377301816992, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 1.1134408712387085, + "step": 5250 + }, + { + "ce_loss": 0.28996923565864563, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.5056423544883728, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.22649165987968445, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 1.2687480449676514, + "step": 5250 + }, + { + "ce_loss": 0.38387155532836914, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.5104761719703674, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.2832685112953186, + "step": 5250 + }, + { + "epoch": 1.7545030020013344, + "loss": 1.1648, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "grad_norm": 2.0206077098846436, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "learning_rate": 0.0007564418380331077, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.9798883199691772, + "step": 5260 + }, + { + "ce_loss": 0.25137314200401306, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.44658365845680237, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.2106369584798813, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 1.2164933681488037, + "step": 5260 + }, + { + "ce_loss": 0.3413506746292114, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.4681362211704254, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.2433163821697235, + "step": 5260 + }, + { + "epoch": 1.7578385590393597, + "loss": 1.1118, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "grad_norm": 1.2726492881774902, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "learning_rate": 0.0007562455298785365, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.9748777747154236, + "step": 5270 + }, + { + "ce_loss": 0.2741670608520508, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.4492979049682617, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.20096157491207123, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.9771814346313477, + "step": 5270 + }, + { + "ce_loss": 0.3123237192630768, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.3900871276855469, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.1981293261051178, + "step": 5270 + }, + { + "epoch": 1.761174116077385, + "loss": 1.1798, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "grad_norm": 1.7693536281585693, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "learning_rate": 0.0007560488059470984, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 1.1466625928878784, + "step": 5280 + }, + { + "ce_loss": 0.28282761573791504, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.506600022315979, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.19416525959968567, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 1.3348321914672852, + "step": 5280 + }, + { + "ce_loss": 0.40944698452949524, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.5854804515838623, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.27034467458724976, + "step": 5280 + }, + { + "epoch": 1.7645096731154104, + "loss": 1.2163, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "grad_norm": 3.366854429244995, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "learning_rate": 0.0007558516664683913, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 1.089564323425293, + "step": 5290 + }, + { + "ce_loss": 0.3452341854572296, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.512012779712677, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.23154222965240479, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 1.4174798727035522, + "step": 5290 + }, + { + "ce_loss": 0.39411184191703796, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.5689647793769836, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.25486916303634644, + "step": 5290 + }, + { + "epoch": 1.7678452301534358, + "loss": 1.085, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "grad_norm": 2.4952659606933594, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "learning_rate": 0.0007556541116724981, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 1.0137168169021606, + "step": 5300 + }, + { + "ce_loss": 0.26594722270965576, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.41260483860969543, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.1979988068342209, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 1.5348089933395386, + "step": 5300 + }, + { + "ce_loss": 0.4159756898880005, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.5758260488510132, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.29201629757881165, + "step": 5300 + }, + { + "epoch": 1.771180787191461, + "loss": 1.1911, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "grad_norm": 2.0186476707458496, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "learning_rate": 0.0007554561417899867, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 1.5441590547561646, + "step": 5310 + }, + { + "ce_loss": 0.29326045513153076, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.47031792998313904, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.2508998215198517, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 1.181255578994751, + "step": 5310 + }, + { + "ce_loss": 0.3429895043373108, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.6115217804908752, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.22667163610458374, + "step": 5310 + }, + { + "epoch": 1.7745163442294865, + "loss": 1.2429, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "grad_norm": 1.53067147731781, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "learning_rate": 0.0007552577570519092, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 1.9128655195236206, + "step": 5320 + }, + { + "ce_loss": 0.47747403383255005, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.5113193988800049, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.3745881915092468, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 1.1742745637893677, + "step": 5320 + }, + { + "ce_loss": 0.3044947683811188, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.4169827997684479, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.21569328010082245, + "step": 5320 + }, + { + "epoch": 1.7778519012675118, + "loss": 1.2755, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "grad_norm": 2.012723922729492, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "learning_rate": 0.0007550589576898018, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 1.4698947668075562, + "step": 5330 + }, + { + "ce_loss": 0.35279497504234314, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.5262315273284912, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.2742978036403656, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 0.9402695298194885, + "step": 5330 + }, + { + "ce_loss": 0.22868719696998596, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.3957265317440033, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.16549795866012573, + "step": 5330 + }, + { + "epoch": 1.7811874583055372, + "loss": 1.1196, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "grad_norm": 1.567074179649353, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "learning_rate": 0.000754859743935685, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 1.668684720993042, + "step": 5340 + }, + { + "ce_loss": 0.38102585077285767, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.4782260060310364, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.3241592049598694, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 0.749116063117981, + "step": 5340 + }, + { + "ce_loss": 0.20039691030979156, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.33525410294532776, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.16110847890377045, + "step": 5340 + }, + { + "epoch": 1.7845230153435625, + "loss": 1.0913, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "grad_norm": 1.7558014392852783, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "learning_rate": 0.0007546601160220624, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 1.0668269395828247, + "step": 5350 + }, + { + "ce_loss": 0.26166832447052, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.4416963756084442, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.18618950247764587, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 1.3767130374908447, + "step": 5350 + }, + { + "ce_loss": 0.4135306477546692, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.4508998990058899, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.1901094913482666, + "step": 5350 + }, + { + "epoch": 1.7878585723815879, + "loss": 1.1026, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "grad_norm": 1.6343917846679688, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "learning_rate": 0.0007544600741819213, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 0.9110533595085144, + "step": 5360 + }, + { + "ce_loss": 0.30580198764801025, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.3742898106575012, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.23055045306682587, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 1.753638505935669, + "step": 5360 + }, + { + "ce_loss": 0.3236137628555298, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.38308921456336975, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.2120286375284195, + "step": 5360 + }, + { + "epoch": 1.7911941294196132, + "loss": 1.1418, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "grad_norm": 1.9476059675216675, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "learning_rate": 0.0007542596186487324, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 1.5973538160324097, + "step": 5370 + }, + { + "ce_loss": 0.3987217843532562, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.5269778966903687, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.32627880573272705, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 1.2063021659851074, + "step": 5370 + }, + { + "ce_loss": 0.31812429428100586, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.4934598207473755, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.2113611102104187, + "step": 5370 + }, + { + "epoch": 1.7945296864576386, + "loss": 1.1803, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "grad_norm": 2.2353322505950928, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "learning_rate": 0.0007540587496564484, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.9624265432357788, + "step": 5380 + }, + { + "ce_loss": 0.2751317620277405, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.4234001338481903, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.19405362010002136, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 1.2300310134887695, + "step": 5380 + }, + { + "ce_loss": 0.2887333631515503, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.4319232702255249, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.21429966390132904, + "step": 5380 + }, + { + "epoch": 1.797865243495664, + "loss": 1.2103, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "grad_norm": 2.053229808807373, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "learning_rate": 0.0007538574674395054, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 0.9388774037361145, + "step": 5390 + }, + { + "ce_loss": 0.22611574828624725, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.4997987151145935, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.2127380520105362, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 1.4807536602020264, + "step": 5390 + }, + { + "ce_loss": 0.41299211978912354, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.5142659544944763, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.24807098507881165, + "step": 5390 + }, + { + "epoch": 1.8012008005336893, + "loss": 1.1674, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "grad_norm": 2.382713556289673, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "learning_rate": 0.0007536557722328214, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 1.3658170700073242, + "step": 5400 + }, + { + "ce_loss": 0.2759830355644226, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.515870988368988, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.1930103451013565, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 1.4405559301376343, + "step": 5400 + }, + { + "ce_loss": 0.34385547041893005, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.519115686416626, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.1835671365261078, + "step": 5400 + }, + { + "epoch": 1.8045363575717146, + "loss": 1.2482, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "grad_norm": 1.628182053565979, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "learning_rate": 0.0007534536642717961, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 1.438568115234375, + "step": 5410 + }, + { + "ce_loss": 0.35257962346076965, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.5803521871566772, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.2886123061180115, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 0.9762012362480164, + "step": 5410 + }, + { + "ce_loss": 0.27220970392227173, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.4245986044406891, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.2115209996700287, + "step": 5410 + }, + { + "epoch": 1.80787191460974, + "loss": 1.3043, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "grad_norm": 1.702199935913086, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "learning_rate": 0.0007532511437923113, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 1.0344901084899902, + "step": 5420 + }, + { + "ce_loss": 0.28677964210510254, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.476391077041626, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.18809427320957184, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 1.2833102941513062, + "step": 5420 + }, + { + "ce_loss": 0.3542989492416382, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.4811602532863617, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.22961384057998657, + "step": 5420 + }, + { + "epoch": 1.8112074716477653, + "loss": 1.143, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "grad_norm": 1.7151790857315063, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "learning_rate": 0.0007530482110307304, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.9452934861183167, + "step": 5430 + }, + { + "ce_loss": 0.2456946074962616, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.3025226593017578, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.22681143879890442, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 1.0299745798110962, + "step": 5430 + }, + { + "ce_loss": 0.3505493998527527, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.36607131361961365, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.2534058392047882, + "step": 5430 + }, + { + "epoch": 1.8145430286857906, + "loss": 1.0969, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "grad_norm": 2.0306644439697266, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "learning_rate": 0.0007528448662238976, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 1.2721872329711914, + "step": 5440 + }, + { + "ce_loss": 0.35148635506629944, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.5788367390632629, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.28733375668525696, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 1.2890324592590332, + "step": 5440 + }, + { + "ce_loss": 0.3618167042732239, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.5949937105178833, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.25563210248947144, + "step": 5440 + }, + { + "epoch": 1.817878585723816, + "loss": 1.1423, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "grad_norm": 1.2551982402801514, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "learning_rate": 0.0007526411096091384, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.9944010972976685, + "step": 5450 + }, + { + "ce_loss": 0.3130263090133667, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.3931999206542969, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.22332419455051422, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.8705968856811523, + "step": 5450 + }, + { + "ce_loss": 0.2662094831466675, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.36075133085250854, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.24345366656780243, + "step": 5450 + }, + { + "epoch": 1.8212141427618413, + "loss": 1.2, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "grad_norm": 2.462411642074585, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "learning_rate": 0.0007524369414242584, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.942243218421936, + "step": 5460 + }, + { + "ce_loss": 0.28354936838150024, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.41044342517852783, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.2307603508234024, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 1.3136810064315796, + "step": 5460 + }, + { + "ce_loss": 0.33640730381011963, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.5245234966278076, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.20452705025672913, + "step": 5460 + }, + { + "epoch": 1.8245496997998667, + "loss": 1.2422, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "grad_norm": 2.514112949371338, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "learning_rate": 0.000752232361907544, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 1.0976307392120361, + "step": 5470 + }, + { + "ce_loss": 0.3172653615474701, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.48672354221343994, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.2345392107963562, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 1.0066343545913696, + "step": 5470 + }, + { + "ce_loss": 0.3098627030849457, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.39110395312309265, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.22836095094680786, + "step": 5470 + }, + { + "epoch": 1.827885256837892, + "loss": 1.0856, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "grad_norm": 1.774595022201538, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "learning_rate": 0.0007520273712977616, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 1.7556549310684204, + "step": 5480 + }, + { + "ce_loss": 0.2319716066122055, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.3767503499984741, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.20208023488521576, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 1.2231870889663696, + "step": 5480 + }, + { + "ce_loss": 0.34553295373916626, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.5309174060821533, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.2443692535161972, + "step": 5480 + }, + { + "epoch": 1.8312208138759174, + "loss": 1.2356, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "grad_norm": 1.8566852807998657, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "learning_rate": 0.000751821969834157, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 1.3017749786376953, + "step": 5490 + }, + { + "ce_loss": 0.30346235632896423, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.4574112296104431, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.23760145902633667, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 1.5731134414672852, + "step": 5490 + }, + { + "ce_loss": 0.29255327582359314, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.4530612826347351, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.18628793954849243, + "step": 5490 + }, + { + "epoch": 1.8345563709139427, + "loss": 1.1548, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "grad_norm": 2.09499454498291, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "learning_rate": 0.000751616157756456, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 1.183165431022644, + "step": 5500 + }, + { + "ce_loss": 0.24258391559123993, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.44836339354515076, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.2561323642730713, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 1.9524562358856201, + "step": 5500 + }, + { + "ce_loss": 0.34489694237709045, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.49662792682647705, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.19847920536994934, + "step": 5500 + }, + { + "epoch": 1.837891927951968, + "loss": 1.1857, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "grad_norm": 2.0431957244873047, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "learning_rate": 0.0007514099353048636, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 1.0093868970870972, + "step": 5510 + }, + { + "ce_loss": 0.32575932145118713, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.4605439603328705, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.22248651087284088, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 1.0461400747299194, + "step": 5510 + }, + { + "ce_loss": 0.2589099109172821, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.5331931114196777, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.1913858950138092, + "step": 5510 + }, + { + "epoch": 1.8412274849899934, + "loss": 1.1587, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "grad_norm": 1.8402179479599, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "learning_rate": 0.0007512033027200634, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 1.1105293035507202, + "step": 5520 + }, + { + "ce_loss": 0.29747650027275085, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.45094624161720276, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.20536360144615173, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 1.1815446615219116, + "step": 5520 + }, + { + "ce_loss": 0.24095499515533447, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.50213223695755, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.20024245977401733, + "step": 5520 + }, + { + "epoch": 1.8445630420280188, + "loss": 1.1764, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "grad_norm": 1.6938358545303345, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "learning_rate": 0.0007509962602432177, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.9573712348937988, + "step": 5530 + }, + { + "ce_loss": 0.30945950746536255, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.4477289319038391, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.19660018384456635, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 1.0629284381866455, + "step": 5530 + }, + { + "ce_loss": 0.31846141815185547, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.43803030252456665, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.2342435121536255, + "step": 5530 + }, + { + "epoch": 1.8478985990660441, + "loss": 1.1684, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "grad_norm": 1.881791114807129, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "learning_rate": 0.0007507888081159678, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 1.2271531820297241, + "step": 5540 + }, + { + "ce_loss": 0.37106800079345703, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.4225543141365051, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.27660301327705383, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 1.0412670373916626, + "step": 5540 + }, + { + "ce_loss": 0.37082210183143616, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.446853369474411, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.22295041382312775, + "step": 5540 + }, + { + "epoch": 1.8512341561040695, + "loss": 1.1782, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "grad_norm": 1.8545033931732178, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "learning_rate": 0.0007505809465804321, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 1.3148798942565918, + "step": 5550 + }, + { + "ce_loss": 0.3096807599067688, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.44882887601852417, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.29201146960258484, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 1.1060969829559326, + "step": 5550 + }, + { + "ce_loss": 0.26883015036582947, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.3587776720523834, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.21522045135498047, + "step": 5550 + }, + { + "epoch": 1.8545697131420948, + "loss": 1.1401, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "grad_norm": 1.8426175117492676, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "learning_rate": 0.0007503726758792079, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 1.0726665258407593, + "step": 5560 + }, + { + "ce_loss": 0.3178630769252777, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.41456592082977295, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.270908385515213, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 1.5326629877090454, + "step": 5560 + }, + { + "ce_loss": 0.38044241070747375, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.49526867270469666, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.3099924921989441, + "step": 5560 + }, + { + "epoch": 1.8579052701801202, + "loss": 1.1707, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "grad_norm": 2.486192226409912, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "learning_rate": 0.0007501639962553691, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 1.1377767324447632, + "step": 5570 + }, + { + "ce_loss": 0.36928075551986694, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.44840309023857117, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.2513749301433563, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 1.3746496438980103, + "step": 5570 + }, + { + "ce_loss": 0.33873075246810913, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.46756061911582947, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.17396891117095947, + "step": 5570 + }, + { + "epoch": 1.8612408272181455, + "loss": 1.2129, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "grad_norm": 2.368302822113037, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "learning_rate": 0.0007499549079524677, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 1.0033069849014282, + "step": 5580 + }, + { + "ce_loss": 0.3440701961517334, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.38273337483406067, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.21641433238983154, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 1.2916123867034912, + "step": 5580 + }, + { + "ce_loss": 0.2969575524330139, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.3884190320968628, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.2365722358226776, + "step": 5580 + }, + { + "epoch": 1.864576384256171, + "loss": 1.2139, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "grad_norm": 2.768338918685913, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "learning_rate": 0.0007497454112145318, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 1.0387018918991089, + "step": 5590 + }, + { + "ce_loss": 0.28695935010910034, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.3994467258453369, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.2750386595726013, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 1.2993309497833252, + "step": 5590 + }, + { + "ce_loss": 0.22111760079860687, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.4015125334262848, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.17563439905643463, + "step": 5590 + }, + { + "epoch": 1.8679119412941962, + "loss": 1.1576, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "grad_norm": 1.3973478078842163, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "learning_rate": 0.000749535506286067, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.9982587099075317, + "step": 5600 + }, + { + "ce_loss": 0.2859624922275543, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.48541250824928284, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.22652120888233185, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 1.2003138065338135, + "step": 5600 + }, + { + "ce_loss": 0.3607683777809143, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.5441848039627075, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.23783503472805023, + "step": 5600 + }, + { + "epoch": 1.8712474983322216, + "loss": 1.1887, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "grad_norm": 2.882995128631592, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "learning_rate": 0.0007493251934120547, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 1.0685358047485352, + "step": 5610 + }, + { + "ce_loss": 0.29604318737983704, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.512268602848053, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.25988975167274475, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 1.2088600397109985, + "step": 5610 + }, + { + "ce_loss": 0.35079681873321533, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.5436410903930664, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.21157583594322205, + "step": 5610 + }, + { + "epoch": 1.874583055370247, + "loss": 1.1698, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "grad_norm": 3.201819658279419, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "learning_rate": 0.0007491144728379528, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 0.9609626531600952, + "step": 5620 + }, + { + "ce_loss": 0.23364980518817902, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.46882686018943787, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.176205113530159, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 1.9625874757766724, + "step": 5620 + }, + { + "ce_loss": 0.32893672585487366, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.5009834170341492, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.2691947817802429, + "step": 5620 + }, + { + "epoch": 1.8779186124082723, + "loss": 1.2908, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "grad_norm": 1.6491600275039673, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "learning_rate": 0.0007489033448096948, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 1.1745247840881348, + "step": 5630 + }, + { + "ce_loss": 0.2985750436782837, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.3555160462856293, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.2050323784351349, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 1.0036077499389648, + "step": 5630 + }, + { + "ce_loss": 0.3427830934524536, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.41693538427352905, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.24346625804901123, + "step": 5630 + }, + { + "epoch": 1.8812541694462976, + "loss": 1.2053, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "grad_norm": 1.826837420463562, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "learning_rate": 0.0007486918095736897, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 1.2536578178405762, + "step": 5640 + }, + { + "ce_loss": 0.2937576174736023, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.636205792427063, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.23159760236740112, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 1.3230061531066895, + "step": 5640 + }, + { + "ce_loss": 0.4143422245979309, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.5751572847366333, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.2638043761253357, + "step": 5640 + }, + { + "epoch": 1.884589726484323, + "loss": 1.1228, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "grad_norm": 1.902815580368042, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "learning_rate": 0.0007484798673768223, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 1.1458295583724976, + "step": 5650 + }, + { + "ce_loss": 0.37263768911361694, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.45414674282073975, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.271620512008667, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 0.9640642404556274, + "step": 5650 + }, + { + "ce_loss": 0.296345591545105, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.4380930960178375, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.21325789391994476, + "step": 5650 + }, + { + "epoch": 1.8879252835223483, + "loss": 1.1008, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "grad_norm": 1.9657776355743408, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "learning_rate": 0.0007482675184664516, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 1.1546496152877808, + "step": 5660 + }, + { + "ce_loss": 0.2919650077819824, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.376723051071167, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.19715668261051178, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.9053738117218018, + "step": 5660 + }, + { + "ce_loss": 0.22867360711097717, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.36551451683044434, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.1943945437669754, + "step": 5660 + }, + { + "epoch": 1.8912608405603737, + "loss": 1.1558, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "grad_norm": 1.620741605758667, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "learning_rate": 0.0007480547630904117, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 1.0649346113204956, + "step": 5670 + }, + { + "ce_loss": 0.2661072611808777, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.4255768954753876, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.2549579441547394, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.9806616902351379, + "step": 5670 + }, + { + "ce_loss": 0.27610334753990173, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.4894576668739319, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.21505948901176453, + "step": 5670 + }, + { + "epoch": 1.894596397598399, + "loss": 1.1236, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "grad_norm": 2.7246854305267334, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "learning_rate": 0.0007478416014970108, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 0.9774023294448853, + "step": 5680 + }, + { + "ce_loss": 0.2604271471500397, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.4833385646343231, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.1873892843723297, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 1.1073026657104492, + "step": 5680 + }, + { + "ce_loss": 0.3042682707309723, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.5258811116218567, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.2086445689201355, + "step": 5680 + }, + { + "epoch": 1.8979319546364244, + "loss": 1.1633, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "grad_norm": 2.3458380699157715, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "learning_rate": 0.0007476280339350319, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.8902542591094971, + "step": 5690 + }, + { + "ce_loss": 0.2166803926229477, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.42578381299972534, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.1565944254398346, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 1.0363528728485107, + "step": 5690 + }, + { + "ce_loss": 0.3460255265235901, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.46616458892822266, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.22409404814243317, + "step": 5690 + }, + { + "epoch": 1.9012675116744497, + "loss": 1.1329, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "grad_norm": 2.120044231414795, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "learning_rate": 0.0007474140606537311, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 1.5393993854522705, + "step": 5700 + }, + { + "ce_loss": 0.35393574833869934, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.5372600555419922, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.24297258257865906, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.8871737718582153, + "step": 5700 + }, + { + "ce_loss": 0.2896822690963745, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.3992535173892975, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.1980990320444107, + "step": 5700 + }, + { + "epoch": 1.904603068712475, + "loss": 1.2162, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "grad_norm": 1.8103832006454468, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "learning_rate": 0.0007471996819028382, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 1.0989534854888916, + "step": 5710 + }, + { + "ce_loss": 0.2598811388015747, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.4422469139099121, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.23823165893554688, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 1.0796765089035034, + "step": 5710 + }, + { + "ce_loss": 0.27391597628593445, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.45274975895881653, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.20589439570903778, + "step": 5710 + }, + { + "epoch": 1.9079386257505004, + "loss": 1.088, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "grad_norm": 1.6986693143844604, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "learning_rate": 0.0007469848979325562, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 0.8809690475463867, + "step": 5720 + }, + { + "ce_loss": 0.2537936568260193, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.377767950296402, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.21097536385059357, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 1.995163917541504, + "step": 5720 + }, + { + "ce_loss": 0.4546584486961365, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.6019603610038757, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.38922926783561707, + "step": 5720 + }, + { + "epoch": 1.9112741827885258, + "loss": 1.2926, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "grad_norm": 1.4185676574707031, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "learning_rate": 0.0007467697089935612, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 1.679802656173706, + "step": 5730 + }, + { + "ce_loss": 0.3139112889766693, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.4400520324707031, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.2145587056875229, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 1.0517017841339111, + "step": 5730 + }, + { + "ce_loss": 0.34958764910697937, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.39745035767555237, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.24486936628818512, + "step": 5730 + }, + { + "epoch": 1.9146097398265511, + "loss": 1.2064, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "grad_norm": 1.8157681226730347, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "learning_rate": 0.0007465541153370019, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 1.1898601055145264, + "step": 5740 + }, + { + "ce_loss": 0.28614503145217896, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.40134701132774353, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.23302209377288818, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 1.3736635446548462, + "step": 5740 + }, + { + "ce_loss": 0.33695244789123535, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.4986035227775574, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.29099375009536743, + "step": 5740 + }, + { + "epoch": 1.9179452968645765, + "loss": 1.1414, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "grad_norm": 1.8164293766021729, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "learning_rate": 0.0007463381172144992, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 1.0783435106277466, + "step": 5750 + }, + { + "ce_loss": 0.31459519267082214, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.46592262387275696, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.2333477884531021, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.9109805226325989, + "step": 5750 + }, + { + "ce_loss": 0.24675825238227844, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.3833222985267639, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.19883716106414795, + "step": 5750 + }, + { + "epoch": 1.9212808539026018, + "loss": 1.1066, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "grad_norm": 1.8867511749267578, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "learning_rate": 0.0007461217148781461, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 1.0111404657363892, + "step": 5760 + }, + { + "ce_loss": 0.2570357322692871, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.4955453872680664, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.19137851893901825, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 1.4673306941986084, + "step": 5760 + }, + { + "ce_loss": 0.40399816632270813, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.5058277249336243, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.319006085395813, + "step": 5760 + }, + { + "epoch": 1.9246164109406272, + "loss": 1.2681, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "grad_norm": 1.7314869165420532, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "learning_rate": 0.0007459049085805075, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 1.031352162361145, + "step": 5770 + }, + { + "ce_loss": 0.3140748143196106, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.47759658098220825, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.19037601351737976, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 1.2562201023101807, + "step": 5770 + }, + { + "ce_loss": 0.3546963930130005, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.569050669670105, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.2440304160118103, + "step": 5770 + }, + { + "epoch": 1.9279519679786525, + "loss": 1.1791, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "grad_norm": 1.8017394542694092, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "learning_rate": 0.0007456876985746199, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 1.1710878610610962, + "step": 5780 + }, + { + "ce_loss": 0.27451661229133606, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.5686822533607483, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.21199308335781097, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.9214657545089722, + "step": 5780 + }, + { + "ce_loss": 0.23972466588020325, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.43682003021240234, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.19800573587417603, + "step": 5780 + }, + { + "epoch": 1.9312875250166779, + "loss": 1.1405, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "grad_norm": 1.826278805732727, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "learning_rate": 0.0007454700851139903, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.8356040120124817, + "step": 5790 + }, + { + "ce_loss": 0.23331232368946075, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.38790708780288696, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.15372376143932343, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 1.0069454908370972, + "step": 5790 + }, + { + "ce_loss": 0.28266188502311707, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.360159695148468, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.22955982387065887, + "step": 5790 + }, + { + "epoch": 1.9346230820547032, + "loss": 1.1853, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "grad_norm": 2.1900691986083984, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "learning_rate": 0.0007452520684525974, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 1.210611343383789, + "step": 5800 + }, + { + "ce_loss": 0.26552486419677734, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.3750151991844177, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.21927280724048615, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 1.9376190900802612, + "step": 5800 + }, + { + "ce_loss": 0.4320092797279358, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.4518512487411499, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.2782735526561737, + "step": 5800 + }, + { + "epoch": 1.9379586390927286, + "loss": 1.2453, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "grad_norm": 2.809798002243042, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "learning_rate": 0.0007450336488448899, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 1.1590726375579834, + "step": 5810 + }, + { + "ce_loss": 0.2636038064956665, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.6099088191986084, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.18551239371299744, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 1.3410998582839966, + "step": 5810 + }, + { + "ce_loss": 0.37131693959236145, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.5650545358657837, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.1918272227048874, + "step": 5810 + }, + { + "epoch": 1.941294196130754, + "loss": 1.1365, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "grad_norm": 2.5075430870056152, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "learning_rate": 0.0007448148265457871, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 0.9481791257858276, + "step": 5820 + }, + { + "ce_loss": 0.22015568614006042, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.37985068559646606, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.19499360024929047, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 1.1051946878433228, + "step": 5820 + }, + { + "ce_loss": 0.24825023114681244, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.37336069345474243, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.23749548196792603, + "step": 5820 + }, + { + "epoch": 1.9446297531687793, + "loss": 1.1193, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "grad_norm": 1.9296715259552002, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "learning_rate": 0.000744595601810678, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 1.1018894910812378, + "step": 5830 + }, + { + "ce_loss": 0.333945631980896, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.43748483061790466, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.20613163709640503, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.9875980615615845, + "step": 5830 + }, + { + "ce_loss": 0.2940070629119873, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.3901951313018799, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.22852958738803864, + "step": 5830 + }, + { + "epoch": 1.9479653102068046, + "loss": 1.0817, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "grad_norm": 2.464088201522827, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "learning_rate": 0.0007443759748954217, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 1.3885273933410645, + "step": 5840 + }, + { + "ce_loss": 0.3946130573749542, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.4795334041118622, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.2565721869468689, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 1.389933466911316, + "step": 5840 + }, + { + "ce_loss": 0.2812231481075287, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.4569879174232483, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.22854776680469513, + "step": 5840 + }, + { + "epoch": 1.95130086724483, + "loss": 1.2102, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "grad_norm": 1.5864168405532837, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "learning_rate": 0.0007441559460563461, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 1.2205491065979004, + "step": 5850 + }, + { + "ce_loss": 0.3103644549846649, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.41929182410240173, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.2448168694972992, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 1.2417341470718384, + "step": 5850 + }, + { + "ce_loss": 0.29796847701072693, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.4613056480884552, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.19173882901668549, + "step": 5850 + }, + { + "epoch": 1.9546364242828553, + "loss": 1.3159, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "grad_norm": 2.4112977981567383, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "learning_rate": 0.0007439355155502489, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 1.291137933731079, + "step": 5860 + }, + { + "ce_loss": 0.3205377459526062, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.4666355550289154, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.23071230947971344, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 1.234226942062378, + "step": 5860 + }, + { + "ce_loss": 0.3565260171890259, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.5676701068878174, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.2438981682062149, + "step": 5860 + }, + { + "epoch": 1.9579719813208807, + "loss": 1.1518, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "grad_norm": 2.057464122772217, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "learning_rate": 0.0007437146836343961, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 1.0851755142211914, + "step": 5870 + }, + { + "ce_loss": 0.3205083906650543, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.47727417945861816, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.2165471762418747, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 1.249542474746704, + "step": 5870 + }, + { + "ce_loss": 0.2969149351119995, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.4526827335357666, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.2169814556837082, + "step": 5870 + }, + { + "epoch": 1.961307538358906, + "loss": 1.1561, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "grad_norm": 2.307563543319702, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "learning_rate": 0.0007434934505665223, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 1.1310524940490723, + "step": 5880 + }, + { + "ce_loss": 0.3645102083683014, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.4820353090763092, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.2054779976606369, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 1.1591945886611938, + "step": 5880 + }, + { + "ce_loss": 0.2980738878250122, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.39868849515914917, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.2012583613395691, + "step": 5880 + }, + { + "epoch": 1.9646430953969314, + "loss": 1.2105, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "grad_norm": 2.318319082260132, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "learning_rate": 0.0007432718166048301, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 1.6052378416061401, + "step": 5890 + }, + { + "ce_loss": 0.36234989762306213, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.526151180267334, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.29334115982055664, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 1.3489465713500977, + "step": 5890 + }, + { + "ce_loss": 0.304647833108902, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.5583611130714417, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.22426193952560425, + "step": 5890 + }, + { + "epoch": 1.9679786524349567, + "loss": 1.2969, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "grad_norm": 2.613701820373535, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "learning_rate": 0.0007430497820079903, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 1.1732020378112793, + "step": 5900 + }, + { + "ce_loss": 0.3025830388069153, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.4889051616191864, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.17684857547283173, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 1.0720723867416382, + "step": 5900 + }, + { + "ce_loss": 0.3472079932689667, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.4841811954975128, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.24045956134796143, + "step": 5900 + }, + { + "epoch": 1.971314209472982, + "loss": 1.1779, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "grad_norm": 2.5917856693267822, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "learning_rate": 0.0007428273470351414, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 1.2020492553710938, + "step": 5910 + }, + { + "ce_loss": 0.34799009561538696, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.49872174859046936, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.2092563956975937, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 1.0802489519119263, + "step": 5910 + }, + { + "ce_loss": 0.2857528030872345, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.4485069215297699, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.24490563571453094, + "step": 5910 + }, + { + "epoch": 1.9746497665110074, + "loss": 1.141, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "grad_norm": 1.7976915836334229, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "learning_rate": 0.0007426045119458886, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.9701898097991943, + "step": 5920 + }, + { + "ce_loss": 0.20789991319179535, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.3563303053379059, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.2152089774608612, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 1.2094759941101074, + "step": 5920 + }, + { + "ce_loss": 0.335591584444046, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.4801439642906189, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.23890548944473267, + "step": 5920 + }, + { + "epoch": 1.9779853235490328, + "loss": 1.1592, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "grad_norm": 1.8345674276351929, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "learning_rate": 0.0007423812770003046, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 1.3806750774383545, + "step": 5930 + }, + { + "ce_loss": 0.2798145115375519, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.43999797105789185, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.22765986621379852, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 1.184518575668335, + "step": 5930 + }, + { + "ce_loss": 0.34069424867630005, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.4194449782371521, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.25200286507606506, + "step": 5930 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.1941, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "grad_norm": 2.0744800567626953, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "learning_rate": 0.0007421576424589287, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.2190189361572266, + "step": 5940 + }, + { + "ce_loss": 0.31040140986442566, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.5107767581939697, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.19094134867191315, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.0603785514831543, + "step": 5940 + }, + { + "ce_loss": 0.3433583676815033, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.46275004744529724, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.25405070185661316, + "step": 5940 + }, + { + "epoch": 1.9846564376250835, + "loss": 1.1856, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "grad_norm": 1.5472403764724731, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "learning_rate": 0.0007419336085827664, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 1.1374337673187256, + "step": 5950 + }, + { + "ce_loss": 0.27851608395576477, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.44976139068603516, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.2229611575603485, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 1.1625683307647705, + "step": 5950 + }, + { + "ce_loss": 0.36853551864624023, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.4269828796386719, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.2808544933795929, + "step": 5950 + }, + { + "epoch": 1.9879919946631088, + "loss": 1.2058, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "grad_norm": 1.7128957509994507, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "learning_rate": 0.0007417091756332892, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 1.47935950756073, + "step": 5960 + }, + { + "ce_loss": 0.33494773507118225, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.5610784292221069, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.2407287359237671, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 1.112879991531372, + "step": 5960 + }, + { + "ce_loss": 0.3090471923351288, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.5229696035385132, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.2024012953042984, + "step": 5960 + }, + { + "epoch": 1.9913275517011342, + "loss": 1.1274, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "grad_norm": 7.496943473815918, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "learning_rate": 0.0007414843438724346, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 1.0705974102020264, + "step": 5970 + }, + { + "ce_loss": 0.2735450267791748, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.4013446271419525, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.22743751108646393, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 1.3012447357177734, + "step": 5970 + }, + { + "ce_loss": 0.3357171416282654, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.48471227288246155, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.2686719298362732, + "step": 5970 + }, + { + "epoch": 1.9946631087391595, + "loss": 1.2532, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "grad_norm": 2.4035911560058594, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "learning_rate": 0.0007412591135626056, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 1.2545922994613647, + "step": 5980 + }, + { + "ce_loss": 0.3356133699417114, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.40619805455207825, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.2650473713874817, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 1.198852300643921, + "step": 5980 + }, + { + "ce_loss": 0.31099480390548706, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.37024760246276855, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.22776618599891663, + "step": 5980 + }, + { + "epoch": 1.9979986657771849, + "loss": 1.145, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "grad_norm": 1.7393022775650024, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "learning_rate": 0.0007410334849666699, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 1.2356904745101929, + "step": 5990 + }, + { + "ce_loss": 0.34358200430870056, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.4329187870025635, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.22351956367492676, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.9917151927947998, + "step": 5990 + }, + { + "ce_loss": 0.3190557658672333, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.4740646779537201, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.19807963073253632, + "step": 5990 + }, + { + "epoch": 2.0013342228152102, + "loss": 1.1926, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "grad_norm": 2.1108322143554688, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "learning_rate": 0.0007408074583479605, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 1.3016316890716553, + "step": 6000 + }, + { + "ce_loss": 0.3355659544467926, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.5252490043640137, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.21098066866397858, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 1.3800817728042603, + "step": 6000 + }, + { + "ce_loss": 0.3436371088027954, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.4823884963989258, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.2499440461397171, + "step": 6000 + }, + { + "epoch": 2.0046697798532356, + "loss": 1.1643, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "grad_norm": 2.1682538986206055, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "learning_rate": 0.0007405810339702751, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 1.1670124530792236, + "step": 6010 + }, + { + "ce_loss": 0.2908676862716675, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.44854360818862915, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.2258313000202179, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 1.1812759637832642, + "step": 6010 + }, + { + "ce_loss": 0.27601760625839233, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.5683032274246216, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.24193869531154633, + "step": 6010 + }, + { + "epoch": 2.008005336891261, + "loss": 1.1144, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "grad_norm": 2.0215084552764893, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "learning_rate": 0.0007403542120978747, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 1.12555992603302, + "step": 6020 + }, + { + "ce_loss": 0.22603082656860352, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.45316022634506226, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.18625034391880035, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 1.530777931213379, + "step": 6020 + }, + { + "ce_loss": 0.37890389561653137, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.6050182580947876, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.25829702615737915, + "step": 6020 + }, + { + "epoch": 2.0113408939292863, + "loss": 1.2366, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "grad_norm": 2.093440055847168, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "learning_rate": 0.0007401269929954853, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 1.0343910455703735, + "step": 6030 + }, + { + "ce_loss": 0.24603171646595, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.5099126100540161, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.21466688811779022, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.9694656729698181, + "step": 6030 + }, + { + "ce_loss": 0.27798622846603394, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.3973238468170166, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.2442408949136734, + "step": 6030 + }, + { + "epoch": 2.0146764509673116, + "loss": 1.0951, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "grad_norm": 1.586016297340393, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "learning_rate": 0.0007398993769282959, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 1.0159122943878174, + "step": 6040 + }, + { + "ce_loss": 0.2615586221218109, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.44198697805404663, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.24152494966983795, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 0.9031857252120972, + "step": 6040 + }, + { + "ce_loss": 0.26589229702949524, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.4166644513607025, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.22043243050575256, + "step": 6040 + }, + { + "epoch": 2.018012008005337, + "loss": 1.1342, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "grad_norm": 3.1845755577087402, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "learning_rate": 0.0007396713641619588, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 1.1987489461898804, + "step": 6050 + }, + { + "ce_loss": 0.3881857693195343, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.4965546131134033, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.2514057755470276, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 0.9910258054733276, + "step": 6050 + }, + { + "ce_loss": 0.2973671853542328, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.4325452744960785, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.18317049741744995, + "step": 6050 + }, + { + "epoch": 2.0213475650433623, + "loss": 1.0778, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "grad_norm": 1.746089220046997, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "learning_rate": 0.0007394429549625898, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.892999529838562, + "step": 6060 + }, + { + "ce_loss": 0.2477852702140808, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.45223918557167053, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.19280417263507843, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 1.0602220296859741, + "step": 6060 + }, + { + "ce_loss": 0.32873767614364624, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.4844486117362976, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.18138428032398224, + "step": 6060 + }, + { + "epoch": 2.0246831220813877, + "loss": 1.143, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "grad_norm": 3.508035898208618, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "learning_rate": 0.0007392141495967666, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 1.2129606008529663, + "step": 6070 + }, + { + "ce_loss": 0.3241889774799347, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.505097508430481, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.19585412740707397, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 1.2293137311935425, + "step": 6070 + }, + { + "ce_loss": 0.36307594180107117, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.4744553565979004, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.29934239387512207, + "step": 6070 + }, + { + "epoch": 2.028018679119413, + "loss": 1.1642, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "grad_norm": 2.075115919113159, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "learning_rate": 0.00073898494833153, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 0.8387904167175293, + "step": 6080 + }, + { + "ce_loss": 0.2569155991077423, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.4047635495662689, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.17692382633686066, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 1.2788372039794922, + "step": 6080 + }, + { + "ce_loss": 0.41565200686454773, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.4378134310245514, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.2810961604118347, + "step": 6080 + }, + { + "epoch": 2.0313542361574384, + "loss": 1.1467, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "grad_norm": 1.7534302473068237, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "learning_rate": 0.0007387553514343824, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.9590702652931213, + "step": 6090 + }, + { + "ce_loss": 0.23299598693847656, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.42754676938056946, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.20577183365821838, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.794579803943634, + "step": 6090 + }, + { + "ce_loss": 0.19693799316883087, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.4145779609680176, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.18288493156433105, + "step": 6090 + }, + { + "epoch": 2.0346897931954637, + "loss": 1.0794, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "grad_norm": 2.150377035140991, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "learning_rate": 0.000738525359173288, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 1.006495475769043, + "step": 6100 + }, + { + "ce_loss": 0.2969522476196289, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.39455148577690125, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.24069784581661224, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 1.1346238851547241, + "step": 6100 + }, + { + "ce_loss": 0.28026166558265686, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.4355485439300537, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.1831492930650711, + "step": 6100 + }, + { + "epoch": 2.038025350233489, + "loss": 1.0818, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "grad_norm": 2.7147910594940186, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "learning_rate": 0.0007382949718166726, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 1.0828402042388916, + "step": 6110 + }, + { + "ce_loss": 0.2484767884016037, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.46675872802734375, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.18916720151901245, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 1.155005931854248, + "step": 6110 + }, + { + "ce_loss": 0.2683473229408264, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.44412925839424133, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.19835783541202545, + "step": 6110 + }, + { + "epoch": 2.0413609072715144, + "loss": 1.1626, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "grad_norm": 2.2923474311828613, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "learning_rate": 0.0007380641896334231, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 1.27774977684021, + "step": 6120 + }, + { + "ce_loss": 0.40639743208885193, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.4476247727870941, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.30624353885650635, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 1.1386736631393433, + "step": 6120 + }, + { + "ce_loss": 0.2917673885822296, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.4815749526023865, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.2240479290485382, + "step": 6120 + }, + { + "epoch": 2.0446964643095398, + "loss": 1.0721, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "grad_norm": 1.971584439277649, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "learning_rate": 0.0007378330128928871, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 1.1431065797805786, + "step": 6130 + }, + { + "ce_loss": 0.3328869938850403, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.5115488767623901, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.23866187036037445, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 1.0687028169631958, + "step": 6130 + }, + { + "ce_loss": 0.33602792024612427, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.4408641457557678, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.21772147715091705, + "step": 6130 + }, + { + "epoch": 2.048032021347565, + "loss": 1.0638, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "grad_norm": 1.7937724590301514, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "learning_rate": 0.0007376014418648727, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 1.265535593032837, + "step": 6140 + }, + { + "ce_loss": 0.2526908814907074, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.37703099846839905, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.2018091231584549, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 1.0709772109985352, + "step": 6140 + }, + { + "ce_loss": 0.27110448479652405, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.39831891655921936, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.2335970103740692, + "step": 6140 + }, + { + "epoch": 2.0513675783855905, + "loss": 1.1445, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "grad_norm": 1.4948005676269531, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "learning_rate": 0.0007373694768196481, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 1.0971550941467285, + "step": 6150 + }, + { + "ce_loss": 0.29519185423851013, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.5128030180931091, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.21166488528251648, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.9408478736877441, + "step": 6150 + }, + { + "ce_loss": 0.29984796047210693, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.4378625452518463, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.20278240740299225, + "step": 6150 + }, + { + "epoch": 2.054703135423616, + "loss": 1.0888, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "grad_norm": 2.1178483963012695, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "learning_rate": 0.0007371371180279417, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 1.3761086463928223, + "step": 6160 + }, + { + "ce_loss": 0.44446414709091187, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.5409948825836182, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.2181801199913025, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 1.0366450548171997, + "step": 6160 + }, + { + "ce_loss": 0.25601834058761597, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.39888685941696167, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.22890298068523407, + "step": 6160 + }, + { + "epoch": 2.058038692461641, + "loss": 1.0642, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "grad_norm": 2.5415828227996826, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "learning_rate": 0.0007369043657609412, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 0.9021316170692444, + "step": 6170 + }, + { + "ce_loss": 0.22570855915546417, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.399993896484375, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.2156895399093628, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 1.0900871753692627, + "step": 6170 + }, + { + "ce_loss": 0.31268736720085144, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.5308760404586792, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.18112565577030182, + "step": 6170 + }, + { + "epoch": 2.0613742494996665, + "loss": 1.0631, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "grad_norm": 1.4687033891677856, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "learning_rate": 0.0007366712202902933, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.9915813207626343, + "step": 6180 + }, + { + "ce_loss": 0.24520814418792725, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.40753301978111267, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.18381255865097046, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 1.2639784812927246, + "step": 6180 + }, + { + "ce_loss": 0.37111666798591614, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.4548932611942291, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.24587491154670715, + "step": 6180 + }, + { + "epoch": 2.064709806537692, + "loss": 1.2493, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "grad_norm": 1.4651398658752441, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "learning_rate": 0.0007364376818881042, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 1.2870615720748901, + "step": 6190 + }, + { + "ce_loss": 0.31373345851898193, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.5066043138504028, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.25695517659187317, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 1.2238523960113525, + "step": 6190 + }, + { + "ce_loss": 0.34804919362068176, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.5069231986999512, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.25139155983924866, + "step": 6190 + }, + { + "epoch": 2.068045363575717, + "loss": 1.1388, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "grad_norm": 1.8307620286941528, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "learning_rate": 0.000736203750826938, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 1.0272313356399536, + "step": 6200 + }, + { + "ce_loss": 0.26185503602027893, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.4729871153831482, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.22979292273521423, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 1.1139501333236694, + "step": 6200 + }, + { + "ce_loss": 0.35409292578697205, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.460551381111145, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.23284848034381866, + "step": 6200 + }, + { + "epoch": 2.0713809206137426, + "loss": 1.0942, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "grad_norm": 1.6087286472320557, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "learning_rate": 0.0007359694273798175, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 1.0432158708572388, + "step": 6210 + }, + { + "ce_loss": 0.28722330927848816, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.44095054268836975, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.21269214153289795, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 0.9859718680381775, + "step": 6210 + }, + { + "ce_loss": 0.2458011358976364, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.3622279167175293, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.18419009447097778, + "step": 6210 + }, + { + "epoch": 2.074716477651768, + "loss": 1.0005, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "grad_norm": 2.0101234912872314, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "learning_rate": 0.0007357347118202235, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 0.9655289649963379, + "step": 6220 + }, + { + "ce_loss": 0.3007512092590332, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.41682693362236023, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.2477555274963379, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 1.1301460266113281, + "step": 6220 + }, + { + "ce_loss": 0.2814660966396332, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.38158291578292847, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.18453769385814667, + "step": 6220 + }, + { + "epoch": 2.0780520346897933, + "loss": 1.0704, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "grad_norm": 2.3904693126678467, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "learning_rate": 0.0007354996044220942, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 1.1304408311843872, + "step": 6230 + }, + { + "ce_loss": 0.29919639229774475, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.43664121627807617, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.2104227989912033, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.8765370845794678, + "step": 6230 + }, + { + "ce_loss": 0.26280224323272705, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.4388915002346039, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.17453253269195557, + "step": 6230 + }, + { + "epoch": 2.0813875917278186, + "loss": 1.1378, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "grad_norm": 2.003026247024536, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "learning_rate": 0.0007352641054598253, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 1.1472810506820679, + "step": 6240 + }, + { + "ce_loss": 0.31809720396995544, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.5361602306365967, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.19889040291309357, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 1.067832112312317, + "step": 6240 + }, + { + "ce_loss": 0.3019455373287201, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.46684446930885315, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.23338524997234344, + "step": 6240 + }, + { + "epoch": 2.084723148765844, + "loss": 1.0864, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "grad_norm": 1.7693291902542114, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "learning_rate": 0.0007350282152082695, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 1.0742148160934448, + "step": 6250 + }, + { + "ce_loss": 0.27459150552749634, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.525519073009491, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.21103417873382568, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 1.170702338218689, + "step": 6250 + }, + { + "ce_loss": 0.388058066368103, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.43670716881752014, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.2780489921569824, + "step": 6250 + }, + { + "epoch": 2.0880587058038693, + "loss": 1.0828, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "grad_norm": 1.7592650651931763, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "learning_rate": 0.000734791933942736, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.9743472933769226, + "step": 6260 + }, + { + "ce_loss": 0.28358349204063416, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.48276543617248535, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.20776033401489258, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 1.0317065715789795, + "step": 6260 + }, + { + "ce_loss": 0.3378918170928955, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.40934500098228455, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.19641610980033875, + "step": 6260 + }, + { + "epoch": 2.0913942628418947, + "loss": 1.0748, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "grad_norm": 1.7081412076950073, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "learning_rate": 0.0007345552619389906, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 1.1185240745544434, + "step": 6270 + }, + { + "ce_loss": 0.3204095661640167, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.5713546872138977, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.2265942245721817, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 1.4075984954833984, + "step": 6270 + }, + { + "ce_loss": 0.3697609007358551, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.5672245025634766, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.2941744029521942, + "step": 6270 + }, + { + "epoch": 2.09472981987992, + "loss": 1.1678, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "grad_norm": 1.6775950193405151, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "learning_rate": 0.0007343181994732547, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 1.2271791696548462, + "step": 6280 + }, + { + "ce_loss": 0.2992215156555176, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.5207251906394958, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.3127952516078949, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 1.2772680521011353, + "step": 6280 + }, + { + "ce_loss": 0.34195807576179504, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.5496068596839905, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.2772253453731537, + "step": 6280 + }, + { + "epoch": 2.0980653769179454, + "loss": 1.1138, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "grad_norm": 1.9288606643676758, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "learning_rate": 0.000734080746822206, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.9897034764289856, + "step": 6290 + }, + { + "ce_loss": 0.32950952649116516, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.3844912648200989, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.2242661714553833, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 1.3939342498779297, + "step": 6290 + }, + { + "ce_loss": 0.35396263003349304, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.5109117031097412, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.22464288771152496, + "step": 6290 + }, + { + "epoch": 2.1014009339559707, + "loss": 1.1161, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "grad_norm": 2.2423195838928223, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "learning_rate": 0.0007338429042629772, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 1.138380527496338, + "step": 6300 + }, + { + "ce_loss": 0.2688751220703125, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.428027868270874, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.21883095800876617, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 1.9510817527770996, + "step": 6300 + }, + { + "ce_loss": 0.3263424038887024, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.41397032141685486, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.24076981842517853, + "step": 6300 + }, + { + "epoch": 2.104736490993996, + "loss": 1.1093, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "grad_norm": 2.5436060428619385, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "learning_rate": 0.0007336046720731559, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 1.1591858863830566, + "step": 6310 + }, + { + "ce_loss": 0.26692333817481995, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.436357319355011, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.23056459426879883, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 1.1801612377166748, + "step": 6310 + }, + { + "ce_loss": 0.23928722739219666, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.42476698756217957, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.18285700678825378, + "step": 6310 + }, + { + "epoch": 2.1080720480320214, + "loss": 1.0749, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "grad_norm": 9.392084121704102, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "learning_rate": 0.0007333660505307852, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 1.1425275802612305, + "step": 6320 + }, + { + "ce_loss": 0.3087732195854187, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.44689956307411194, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.27974626421928406, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 1.2156401872634888, + "step": 6320 + }, + { + "ce_loss": 0.3612198233604431, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.41263580322265625, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.23722223937511444, + "step": 6320 + }, + { + "epoch": 2.1114076050700468, + "loss": 1.1452, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "grad_norm": 2.4938435554504395, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "learning_rate": 0.0007331270399143618, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 1.1720032691955566, + "step": 6330 + }, + { + "ce_loss": 0.35546770691871643, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.47783389687538147, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.27097728848457336, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 1.4047222137451172, + "step": 6330 + }, + { + "ce_loss": 0.2705877721309662, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.4400711953639984, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.2454250603914261, + "step": 6330 + }, + { + "epoch": 2.114743162108072, + "loss": 1.1854, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "grad_norm": 2.1468722820281982, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "learning_rate": 0.0007328876405028367, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 1.0781439542770386, + "step": 6340 + }, + { + "ce_loss": 0.28617146611213684, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.4413968026638031, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.21744798123836517, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 1.0336925983428955, + "step": 6340 + }, + { + "ce_loss": 0.2718295753002167, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.4237961173057556, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.20000006258487701, + "step": 6340 + }, + { + "epoch": 2.1180787191460975, + "loss": 1.13, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "grad_norm": 2.2106473445892334, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "learning_rate": 0.0007326478525756151, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.8444902896881104, + "step": 6350 + }, + { + "ce_loss": 0.21335484087467194, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.3180597722530365, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.14585541188716888, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.9812553524971008, + "step": 6350 + }, + { + "ce_loss": 0.2725522816181183, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.44498321413993835, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.2124388962984085, + "step": 6350 + }, + { + "epoch": 2.121414276184123, + "loss": 1.1336, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "grad_norm": 1.6461633443832397, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "learning_rate": 0.0007324076764125552, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 0.9389811158180237, + "step": 6360 + }, + { + "ce_loss": 0.26564937829971313, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.4470508098602295, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.1644158661365509, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 1.4932483434677124, + "step": 6360 + }, + { + "ce_loss": 0.40737444162368774, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.660778820514679, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.260358065366745, + "step": 6360 + }, + { + "epoch": 2.124749833222148, + "loss": 1.107, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "grad_norm": 1.6320087909698486, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "learning_rate": 0.0007321671122939684, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 0.9644901752471924, + "step": 6370 + }, + { + "ce_loss": 0.24344997107982635, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.4070049226284027, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.24330593645572662, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 0.6866275072097778, + "step": 6370 + }, + { + "ce_loss": 0.19747519493103027, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.3194316029548645, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.16923660039901733, + "step": 6370 + }, + { + "epoch": 2.1280853902601735, + "loss": 1.0816, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "grad_norm": 1.6301352977752686, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "learning_rate": 0.0007319261605006188, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 1.198629379272461, + "step": 6380 + }, + { + "ce_loss": 0.286978155374527, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.46526041626930237, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.22198054194450378, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 1.8673536777496338, + "step": 6380 + }, + { + "ce_loss": 0.36875566840171814, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.549709677696228, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.2909049093723297, + "step": 6380 + }, + { + "epoch": 2.131420947298199, + "loss": 1.2046, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "grad_norm": 1.5458937883377075, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "learning_rate": 0.0007316848213137231, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 0.9795043468475342, + "step": 6390 + }, + { + "ce_loss": 0.25236889719963074, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.38965165615081787, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.19905973970890045, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 1.0322065353393555, + "step": 6390 + }, + { + "ce_loss": 0.29902148246765137, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.5339668989181519, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.19888225197792053, + "step": 6390 + }, + { + "epoch": 2.134756504336224, + "loss": 1.1691, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "grad_norm": 1.7552231550216675, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "learning_rate": 0.0007314430950149502, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 1.2230530977249146, + "step": 6400 + }, + { + "ce_loss": 0.22836093604564667, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.3304770886898041, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.2264941781759262, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 0.9575164318084717, + "step": 6400 + }, + { + "ce_loss": 0.2918906509876251, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.34815043210983276, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.25119641423225403, + "step": 6400 + }, + { + "epoch": 2.1380920613742496, + "loss": 1.1057, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "grad_norm": 3.011944532394409, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "learning_rate": 0.0007312009818864209, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 1.1187481880187988, + "step": 6410 + }, + { + "ce_loss": 0.3483152687549591, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.4989229142665863, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.20267225801944733, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.8301767706871033, + "step": 6410 + }, + { + "ce_loss": 0.2062799632549286, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.3910866975784302, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.177321195602417, + "step": 6410 + }, + { + "epoch": 2.141427618412275, + "loss": 1.0847, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "grad_norm": 1.7740399837493896, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "learning_rate": 0.0007309584822107068, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 1.1315577030181885, + "step": 6420 + }, + { + "ce_loss": 0.32394424080848694, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.4168577790260315, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.24851790070533752, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 1.1237907409667969, + "step": 6420 + }, + { + "ce_loss": 0.3055866062641144, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.46554845571517944, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.2067885845899582, + "step": 6420 + }, + { + "epoch": 2.1447631754503003, + "loss": 1.0649, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "grad_norm": 2.0649614334106445, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "learning_rate": 0.0007307155962708314, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.9748170971870422, + "step": 6430 + }, + { + "ce_loss": 0.25578707456588745, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.45131832361221313, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.21652290225028992, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 1.2006250619888306, + "step": 6430 + }, + { + "ce_loss": 0.3162148594856262, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.47678422927856445, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.3085026443004608, + "step": 6430 + }, + { + "epoch": 2.1480987324883256, + "loss": 1.1302, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "grad_norm": 2.0026278495788574, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "learning_rate": 0.0007304723243502686, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.9228730797767639, + "step": 6440 + }, + { + "ce_loss": 0.21623915433883667, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.4748040735721588, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.22128619253635406, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.9328144788742065, + "step": 6440 + }, + { + "ce_loss": 0.22637756168842316, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.4844534397125244, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.2073233723640442, + "step": 6440 + }, + { + "epoch": 2.151434289526351, + "loss": 1.1879, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "grad_norm": 1.8786402940750122, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "learning_rate": 0.000730228666732943, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.9601532220840454, + "step": 6450 + }, + { + "ce_loss": 0.24260313808918, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.3780670464038849, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.18927910923957825, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.9228167533874512, + "step": 6450 + }, + { + "ce_loss": 0.3097418546676636, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.4570564031600952, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.15541477501392365, + "step": 6450 + }, + { + "epoch": 2.1547698465643763, + "loss": 1.113, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "grad_norm": 2.095228672027588, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "learning_rate": 0.0007299846237032293, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 1.3037450313568115, + "step": 6460 + }, + { + "ce_loss": 0.21096457540988922, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.4512532353401184, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.19502690434455872, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 1.120269775390625, + "step": 6460 + }, + { + "ce_loss": 0.325928658246994, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.4305892586708069, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.24968509376049042, + "step": 6460 + }, + { + "epoch": 2.1581054036024017, + "loss": 1.1734, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "grad_norm": 1.8886241912841797, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "learning_rate": 0.000729740195545952, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.9583158493041992, + "step": 6470 + }, + { + "ce_loss": 0.2795897126197815, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.41173821687698364, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.2124219685792923, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 1.1033215522766113, + "step": 6470 + }, + { + "ce_loss": 0.39897602796554565, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.4334798753261566, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.2701454758644104, + "step": 6470 + }, + { + "epoch": 2.161440960640427, + "loss": 1.152, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "grad_norm": 1.923465371131897, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "learning_rate": 0.0007294953825463849, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 1.3026888370513916, + "step": 6480 + }, + { + "ce_loss": 0.35526034235954285, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.5430156588554382, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.282005250453949, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 0.9665287733078003, + "step": 6480 + }, + { + "ce_loss": 0.27917689085006714, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.4497480094432831, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.18187867105007172, + "step": 6480 + }, + { + "epoch": 2.1647765176784524, + "loss": 1.0982, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "grad_norm": 1.5525462627410889, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "learning_rate": 0.0007292501849902513, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.9274162650108337, + "step": 6490 + }, + { + "ce_loss": 0.22487910091876984, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.3908233642578125, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.2260996699333191, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 1.2589856386184692, + "step": 6490 + }, + { + "ce_loss": 0.41789162158966064, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.5558347105979919, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.27965062856674194, + "step": 6490 + }, + { + "epoch": 2.1681120747164777, + "loss": 1.1382, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "grad_norm": 1.6293458938598633, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "learning_rate": 0.0007290046031637229, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 1.054274320602417, + "step": 6500 + }, + { + "ce_loss": 0.2959146797657013, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.4802056849002838, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.2115231305360794, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 1.2698650360107422, + "step": 6500 + }, + { + "ce_loss": 0.31689831614494324, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.42682600021362305, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.2237817496061325, + "step": 6500 + }, + { + "epoch": 2.171447631754503, + "loss": 1.1373, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "grad_norm": 1.8974462747573853, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "learning_rate": 0.0007287586373534202, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 1.0428264141082764, + "step": 6510 + }, + { + "ce_loss": 0.26314231753349304, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.4809033274650574, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.20492221415042877, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 1.293421983718872, + "step": 6510 + }, + { + "ce_loss": 0.3857915699481964, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.6188392043113708, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.2295541614294052, + "step": 6510 + }, + { + "epoch": 2.1747831887925284, + "loss": 1.1598, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "grad_norm": 2.1783154010772705, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "learning_rate": 0.000728512287846412, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 1.2277634143829346, + "step": 6520 + }, + { + "ce_loss": 0.21726199984550476, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.5520312786102295, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.17318247258663177, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.9285261631011963, + "step": 6520 + }, + { + "ce_loss": 0.22456702589988708, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.4865211546421051, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.1675080806016922, + "step": 6520 + }, + { + "epoch": 2.1781187458305538, + "loss": 1.1704, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "grad_norm": 1.9148839712142944, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "learning_rate": 0.0007282655549302144, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 1.0659818649291992, + "step": 6530 + }, + { + "ce_loss": 0.29880058765411377, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.4344814717769623, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.16811010241508484, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.9991016387939453, + "step": 6530 + }, + { + "ce_loss": 0.3414323627948761, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.40579909086227417, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.25159013271331787, + "step": 6530 + }, + { + "epoch": 2.181454302868579, + "loss": 1.0964, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "grad_norm": 1.865465521812439, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "learning_rate": 0.0007280184388927914, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 1.025991439819336, + "step": 6540 + }, + { + "ce_loss": 0.3279627561569214, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.3677094578742981, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.25373217463493347, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 1.0250794887542725, + "step": 6540 + }, + { + "ce_loss": 0.304439514875412, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.41830167174339294, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.19563069939613342, + "step": 6540 + }, + { + "epoch": 2.1847898599066045, + "loss": 1.0431, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "grad_norm": 2.291187047958374, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "learning_rate": 0.000727770940022554, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 1.1050126552581787, + "step": 6550 + }, + { + "ce_loss": 0.3487250804901123, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.46970731019973755, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.20708809792995453, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 1.2422537803649902, + "step": 6550 + }, + { + "ce_loss": 0.3275320827960968, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.5561754703521729, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.24336011707782745, + "step": 6550 + }, + { + "epoch": 2.18812541694463, + "loss": 1.1976, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "grad_norm": 1.8396140336990356, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "learning_rate": 0.0007275230586083598, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 1.2439719438552856, + "step": 6560 + }, + { + "ce_loss": 0.2645409107208252, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.5173214673995972, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.23124998807907104, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 1.1374619007110596, + "step": 6560 + }, + { + "ce_loss": 0.2659108340740204, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.5399746894836426, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.2251754105091095, + "step": 6560 + }, + { + "epoch": 2.191460973982655, + "loss": 1.1713, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "grad_norm": 3.3113059997558594, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "learning_rate": 0.0007272747949395134, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 1.1385853290557861, + "step": 6570 + }, + { + "ce_loss": 0.23356357216835022, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.4046405553817749, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.21334941685199738, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 0.8915872573852539, + "step": 6570 + }, + { + "ce_loss": 0.2360234260559082, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.3655630350112915, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.20562811195850372, + "step": 6570 + }, + { + "epoch": 2.1947965310206805, + "loss": 1.0959, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "grad_norm": 5.928073406219482, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "learning_rate": 0.0007270261493057652, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.8915174007415771, + "step": 6580 + }, + { + "ce_loss": 0.20917125046253204, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.3442421853542328, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.16487248241901398, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 1.161110758781433, + "step": 6580 + }, + { + "ce_loss": 0.28940436244010925, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.4610818028450012, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.23252765834331512, + "step": 6580 + }, + { + "epoch": 2.198132088058706, + "loss": 1.1057, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "grad_norm": 1.954878330230713, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "learning_rate": 0.000726777121997311, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 0.9378560781478882, + "step": 6590 + }, + { + "ce_loss": 0.25078916549682617, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.426094114780426, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.26082292199134827, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 1.06697678565979, + "step": 6590 + }, + { + "ce_loss": 0.2149425894021988, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.44963303208351135, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.21024475991725922, + "step": 6590 + }, + { + "epoch": 2.201467645096731, + "loss": 1.0868, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "grad_norm": 1.9652372598648071, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "learning_rate": 0.000726527713304793, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 0.7863520383834839, + "step": 6600 + }, + { + "ce_loss": 0.22365817427635193, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.3934679925441742, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.16903427243232727, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 1.895040512084961, + "step": 6600 + }, + { + "ce_loss": 0.2216983139514923, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.40213045477867126, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.18472428619861603, + "step": 6600 + }, + { + "epoch": 2.2048032021347566, + "loss": 1.101, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "grad_norm": 4.147248268127441, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "learning_rate": 0.0007262779235192977, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 1.2284412384033203, + "step": 6610 + }, + { + "ce_loss": 0.3198404908180237, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.45762690901756287, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.22917161881923676, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.9958036541938782, + "step": 6610 + }, + { + "ce_loss": 0.2828834652900696, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.4549398422241211, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.18840783834457397, + "step": 6610 + }, + { + "epoch": 2.208138759172782, + "loss": 1.033, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "grad_norm": 3.1193883419036865, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "learning_rate": 0.0007260277529323565, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 0.9063237905502319, + "step": 6620 + }, + { + "ce_loss": 0.26534974575042725, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.3766826391220093, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.19527144730091095, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 1.1124207973480225, + "step": 6620 + }, + { + "ce_loss": 0.33875322341918945, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.4154742956161499, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.2512235939502716, + "step": 6620 + }, + { + "epoch": 2.2114743162108073, + "loss": 1.1578, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "grad_norm": 1.9319946765899658, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "learning_rate": 0.0007257772018359458, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 1.1475681066513062, + "step": 6630 + }, + { + "ce_loss": 0.34494549036026, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.5608721971511841, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.18402761220932007, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 1.2129950523376465, + "step": 6630 + }, + { + "ce_loss": 0.3182290196418762, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.4504798650741577, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.264813631772995, + "step": 6630 + }, + { + "epoch": 2.2148098732488326, + "loss": 1.068, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "grad_norm": 1.38816499710083, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "learning_rate": 0.0007255262705224854, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 1.1173572540283203, + "step": 6640 + }, + { + "ce_loss": 0.31794416904449463, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.5177514553070068, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.20269501209259033, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 1.145826816558838, + "step": 6640 + }, + { + "ce_loss": 0.3013113737106323, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.43260765075683594, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.22437791526317596, + "step": 6640 + }, + { + "epoch": 2.218145430286858, + "loss": 1.0474, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "grad_norm": 1.715024709701538, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "learning_rate": 0.0007252749592848392, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 0.9926832318305969, + "step": 6650 + }, + { + "ce_loss": 0.2870025634765625, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.4564557671546936, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.1875499188899994, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 1.396606206893921, + "step": 6650 + }, + { + "ce_loss": 0.2882598042488098, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.4250618815422058, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.20096905529499054, + "step": 6650 + }, + { + "epoch": 2.2214809873248833, + "loss": 1.1115, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "grad_norm": 1.6308581829071045, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "learning_rate": 0.0007250232684163146, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 1.1638392210006714, + "step": 6660 + }, + { + "ce_loss": 0.32330602407455444, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.4661082923412323, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.24704022705554962, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.839045524597168, + "step": 6660 + }, + { + "ce_loss": 0.24197931587696075, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.3925042748451233, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.2042776197195053, + "step": 6660 + }, + { + "epoch": 2.2248165443629087, + "loss": 1.1374, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "grad_norm": 2.5563485622406006, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "learning_rate": 0.0007247711982106618, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 1.0619388818740845, + "step": 6670 + }, + { + "ce_loss": 0.33162665367126465, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.4590032696723938, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.2071804404258728, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 1.3315930366516113, + "step": 6670 + }, + { + "ce_loss": 0.3238199055194855, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.5190696716308594, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.25709268450737, + "step": 6670 + }, + { + "epoch": 2.228152101400934, + "loss": 1.1388, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "grad_norm": 2.195162296295166, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "learning_rate": 0.0007245187489620736, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 1.386202335357666, + "step": 6680 + }, + { + "ce_loss": 0.2633619010448456, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.3492848575115204, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.24636389315128326, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 1.4848406314849854, + "step": 6680 + }, + { + "ce_loss": 0.23591652512550354, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.4073147177696228, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.16064199805259705, + "step": 6680 + }, + { + "epoch": 2.2314876584389594, + "loss": 1.0865, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "grad_norm": 2.414093494415283, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "learning_rate": 0.000724265920965186, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.8304829597473145, + "step": 6690 + }, + { + "ce_loss": 0.22507025301456451, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.3573000729084015, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.1854206770658493, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.9495757818222046, + "step": 6690 + }, + { + "ce_loss": 0.24371907114982605, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.39758509397506714, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.22929032146930695, + "step": 6690 + }, + { + "epoch": 2.2348232154769847, + "loss": 1.1242, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "grad_norm": 2.162372350692749, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "learning_rate": 0.000724012714515076, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 1.1144814491271973, + "step": 6700 + }, + { + "ce_loss": 0.35344523191452026, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.4919400215148926, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.2676081657409668, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 1.06876540184021, + "step": 6700 + }, + { + "ce_loss": 0.3048124611377716, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.5308917760848999, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.23223397135734558, + "step": 6700 + }, + { + "epoch": 2.23815877251501, + "loss": 1.0419, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "grad_norm": 1.5892221927642822, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "learning_rate": 0.000723759129907263, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 0.8763984441757202, + "step": 6710 + }, + { + "ce_loss": 0.21821273863315582, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.4226665496826172, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.17459844052791595, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 1.2446012496948242, + "step": 6710 + }, + { + "ce_loss": 0.29771167039871216, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.4279354214668274, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.19987165927886963, + "step": 6710 + }, + { + "epoch": 2.2414943295530354, + "loss": 1.1207, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "grad_norm": 1.7283515930175781, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "learning_rate": 0.0007235051674377076, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 1.7800657749176025, + "step": 6720 + }, + { + "ce_loss": 0.2828550934791565, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.4153803288936615, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.20675329864025116, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 1.069749116897583, + "step": 6720 + }, + { + "ce_loss": 0.3165798783302307, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.39565014839172363, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.21022526919841766, + "step": 6720 + }, + { + "epoch": 2.2448298865910608, + "loss": 1.2058, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "grad_norm": 2.3618218898773193, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "learning_rate": 0.000723250827402811, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.850956380367279, + "step": 6730 + }, + { + "ce_loss": 0.21707533299922943, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.39951059222221375, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.16816748678684235, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 1.2139538526535034, + "step": 6730 + }, + { + "ce_loss": 0.27926763892173767, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.40093597769737244, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.25574299693107605, + "step": 6730 + }, + { + "epoch": 2.248165443629086, + "loss": 1.1898, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "grad_norm": 2.773587703704834, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "learning_rate": 0.0007229961100994156, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 1.0061744451522827, + "step": 6740 + }, + { + "ce_loss": 0.29060429334640503, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.4561101794242859, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.2093733251094818, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 0.8028789162635803, + "step": 6740 + }, + { + "ce_loss": 0.21336449682712555, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.3714117705821991, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.14982610940933228, + "step": 6740 + }, + { + "epoch": 2.2515010006671115, + "loss": 1.0326, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "grad_norm": 1.863098382949829, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "learning_rate": 0.0007227410158248041, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 1.0014573335647583, + "step": 6750 + }, + { + "ce_loss": 0.2929192781448364, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.3995145857334137, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.2523522675037384, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.8479694724082947, + "step": 6750 + }, + { + "ce_loss": 0.2661614418029785, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.33442169427871704, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.1988968700170517, + "step": 6750 + }, + { + "epoch": 2.254836557705137, + "loss": 1.0235, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "grad_norm": 3.644243001937866, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "learning_rate": 0.0007224855448766986, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.9441705942153931, + "step": 6760 + }, + { + "ce_loss": 0.2512178122997284, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.45023685693740845, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.21678780019283295, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.9748396277427673, + "step": 6760 + }, + { + "ce_loss": 0.24539008736610413, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.4820556044578552, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.1842784881591797, + "step": 6760 + }, + { + "epoch": 2.258172114743162, + "loss": 1.0713, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "grad_norm": 3.3986644744873047, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "learning_rate": 0.0007222296975532614, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.990540087223053, + "step": 6770 + }, + { + "ce_loss": 0.19007956981658936, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.43442580103874207, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.21168890595436096, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.8435627222061157, + "step": 6770 + }, + { + "ce_loss": 0.2930505573749542, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.3538534343242645, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.1962963491678238, + "step": 6770 + }, + { + "epoch": 2.2615076717811875, + "loss": 1.1581, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "grad_norm": 3.7728989124298096, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "learning_rate": 0.0007219734741530937, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 1.049188256263733, + "step": 6780 + }, + { + "ce_loss": 0.3014962673187256, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.5301869511604309, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.21730859577655792, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.8995069265365601, + "step": 6780 + }, + { + "ce_loss": 0.2236035019159317, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.38950619101524353, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.22236211597919464, + "step": 6780 + }, + { + "epoch": 2.264843228819213, + "loss": 1.1343, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "grad_norm": 2.17807936668396, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "learning_rate": 0.0007217168749752361, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.7967932224273682, + "step": 6790 + }, + { + "ce_loss": 0.18900160491466522, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.35759422183036804, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.13914166390895844, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.90980064868927, + "step": 6790 + }, + { + "ce_loss": 0.22265170514583588, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.404280424118042, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.22244516015052795, + "step": 6790 + }, + { + "epoch": 2.268178785857238, + "loss": 1.1049, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "grad_norm": 1.7885805368423462, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "learning_rate": 0.0007214599003191671, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 1.1400399208068848, + "step": 6800 + }, + { + "ce_loss": 0.24055874347686768, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.3624168336391449, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.22153976559638977, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 0.7367714643478394, + "step": 6800 + }, + { + "ce_loss": 0.1923925280570984, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.3727942109107971, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.17138351500034332, + "step": 6800 + }, + { + "epoch": 2.2715143428952635, + "loss": 1.1082, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "grad_norm": 2.1015965938568115, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "learning_rate": 0.0007212025504848039, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 1.107190489768982, + "step": 6810 + }, + { + "ce_loss": 0.23536983132362366, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.3510274887084961, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.1961394101381302, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.9859842658042908, + "step": 6810 + }, + { + "ce_loss": 0.31096890568733215, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.4421943724155426, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.1737508475780487, + "step": 6810 + }, + { + "epoch": 2.274849899933289, + "loss": 1.1412, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "grad_norm": 1.7945131063461304, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "learning_rate": 0.0007209448257725015, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 1.0342320203781128, + "step": 6820 + }, + { + "ce_loss": 0.2750062346458435, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.3843204379081726, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.2735709547996521, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 0.9394274950027466, + "step": 6820 + }, + { + "ce_loss": 0.3055344521999359, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.38053858280181885, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.2532532513141632, + "step": 6820 + }, + { + "epoch": 2.2781854569713142, + "loss": 1.0606, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "grad_norm": 2.098090887069702, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "learning_rate": 0.0007206867264830523, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.8209320306777954, + "step": 6830 + }, + { + "ce_loss": 0.2228832095861435, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.3807736933231354, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.1797008067369461, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 1.111886978149414, + "step": 6830 + }, + { + "ce_loss": 0.2975417375564575, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.4608473479747772, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.2796211838722229, + "step": 6830 + }, + { + "epoch": 2.2815210140093396, + "loss": 1.0477, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "grad_norm": 3.1342854499816895, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "learning_rate": 0.000720428252917686, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 1.593949794769287, + "step": 6840 + }, + { + "ce_loss": 0.35022589564323425, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.41967836022377014, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.271299809217453, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 1.1150100231170654, + "step": 6840 + }, + { + "ce_loss": 0.288134902715683, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.4372361898422241, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.22447191178798676, + "step": 6840 + }, + { + "epoch": 2.284856571047365, + "loss": 1.0996, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "grad_norm": 1.9681981801986694, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "learning_rate": 0.000720169405378069, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 1.0952081680297852, + "step": 6850 + }, + { + "ce_loss": 0.2938965857028961, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.4962945580482483, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.17742693424224854, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 1.3066614866256714, + "step": 6850 + }, + { + "ce_loss": 0.4631924033164978, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.6236108541488647, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.21928176283836365, + "step": 6850 + }, + { + "epoch": 2.2881921280853903, + "loss": 1.1893, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "grad_norm": 1.8243825435638428, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "learning_rate": 0.0007199101841663042, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 1.8653347492218018, + "step": 6860 + }, + { + "ce_loss": 0.3484281599521637, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.4665539860725403, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.19580277800559998, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 1.0679538249969482, + "step": 6860 + }, + { + "ce_loss": 0.3303784132003784, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.475466251373291, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.26188352704048157, + "step": 6860 + }, + { + "epoch": 2.2915276851234156, + "loss": 1.2505, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "grad_norm": 3.925416946411133, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "learning_rate": 0.000719650589584931, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 1.038080096244812, + "step": 6870 + }, + { + "ce_loss": 0.2771221697330475, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.44759422540664673, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.23251193761825562, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 1.0797985792160034, + "step": 6870 + }, + { + "ce_loss": 0.29750117659568787, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.48262596130371094, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.18303725123405457, + "step": 6870 + }, + { + "epoch": 2.294863242161441, + "loss": 1.1422, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "grad_norm": 2.3651957511901855, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "learning_rate": 0.0007193906219369236, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 1.2924625873565674, + "step": 6880 + }, + { + "ce_loss": 0.40562325716018677, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.5779322385787964, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.24361947178840637, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 1.1276745796203613, + "step": 6880 + }, + { + "ce_loss": 0.3326677680015564, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.5647445917129517, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.22994059324264526, + "step": 6880 + }, + { + "epoch": 2.2981987991994663, + "loss": 1.1616, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "grad_norm": 2.5007548332214355, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "learning_rate": 0.0007191302815256927, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 1.0267442464828491, + "step": 6890 + }, + { + "ce_loss": 0.3053237497806549, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.44782763719558716, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.22624436020851135, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 1.4283981323242188, + "step": 6890 + }, + { + "ce_loss": 0.3315616846084595, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.5364858508110046, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.22474513947963715, + "step": 6890 + }, + { + "epoch": 2.3015343562374917, + "loss": 1.0545, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "grad_norm": 1.568762183189392, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "learning_rate": 0.0007188695686550835, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.9278749823570251, + "step": 6900 + }, + { + "ce_loss": 0.2324940264225006, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.4496258795261383, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.18112577497959137, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 1.423940896987915, + "step": 6900 + }, + { + "ce_loss": 0.2764613628387451, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.40918517112731934, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.26147595047950745, + "step": 6900 + }, + { + "epoch": 2.304869913275517, + "loss": 1.1087, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "grad_norm": 2.541975498199463, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "learning_rate": 0.0007186084836293757, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.925176203250885, + "step": 6910 + }, + { + "ce_loss": 0.2801961302757263, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.4048868715763092, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.23977307975292206, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.8737236261367798, + "step": 6910 + }, + { + "ce_loss": 0.21469813585281372, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.3422822058200836, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.1523410528898239, + "step": 6910 + }, + { + "epoch": 2.3082054703135424, + "loss": 1.0511, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "grad_norm": 1.754577875137329, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "learning_rate": 0.000718347026753284, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.9576894044876099, + "step": 6920 + }, + { + "ce_loss": 0.22706426680088043, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.407731831073761, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.24513918161392212, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 1.3680813312530518, + "step": 6920 + }, + { + "ce_loss": 0.3933814764022827, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.5302331447601318, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.2798175811767578, + "step": 6920 + }, + { + "epoch": 2.3115410273515677, + "loss": 1.1067, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "grad_norm": 2.897920608520508, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "learning_rate": 0.0007180851983319564, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 1.1339976787567139, + "step": 6930 + }, + { + "ce_loss": 0.3197008967399597, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.5434731841087341, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.21307173371315002, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 1.1253206729888916, + "step": 6930 + }, + { + "ce_loss": 0.2909734547138214, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.42577847838401794, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.2254728227853775, + "step": 6930 + }, + { + "epoch": 2.314876584389593, + "loss": 1.0654, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "grad_norm": 2.6729137897491455, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "learning_rate": 0.000717822998670975, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 0.9879928231239319, + "step": 6940 + }, + { + "ce_loss": 0.2600327432155609, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.5054440498352051, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.1790090799331665, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 1.4961574077606201, + "step": 6940 + }, + { + "ce_loss": 0.29782921075820923, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.5347850322723389, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.20284485816955566, + "step": 6940 + }, + { + "epoch": 2.3182121414276184, + "loss": 1.0843, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "grad_norm": 2.8103513717651367, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "learning_rate": 0.000717560428076355, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 1.302746057510376, + "step": 6950 + }, + { + "ce_loss": 0.38361117243766785, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.5667811036109924, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.2757999002933502, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 1.3873010873794556, + "step": 6950 + }, + { + "ce_loss": 0.34579405188560486, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.5673583745956421, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.2157682180404663, + "step": 6950 + }, + { + "epoch": 2.321547698465644, + "loss": 1.0658, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "grad_norm": 2.6049487590789795, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "learning_rate": 0.0007172974868545445, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 1.1189073324203491, + "step": 6960 + }, + { + "ce_loss": 0.279064416885376, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.5091290473937988, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.18219119310379028, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 1.141963005065918, + "step": 6960 + }, + { + "ce_loss": 0.39140236377716064, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.5031481981277466, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.2467627376317978, + "step": 6960 + }, + { + "epoch": 2.324883255503669, + "loss": 1.0916, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "grad_norm": 2.408190965652466, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "learning_rate": 0.0007170341753124242, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 1.1541481018066406, + "step": 6970 + }, + { + "ce_loss": 0.3108407258987427, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.4415343701839447, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.20662188529968262, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 1.3969581127166748, + "step": 6970 + }, + { + "ce_loss": 0.2721030116081238, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.45736241340637207, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.21603234112262726, + "step": 6970 + }, + { + "epoch": 2.3282188125416945, + "loss": 1.1454, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "grad_norm": 1.446701169013977, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "learning_rate": 0.0007167704937573071, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 1.009114384651184, + "step": 6980 + }, + { + "ce_loss": 0.3090960681438446, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.4400170147418976, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.25981760025024414, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.9236708283424377, + "step": 6980 + }, + { + "ce_loss": 0.2558498680591583, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.417130708694458, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.17787563800811768, + "step": 6980 + }, + { + "epoch": 2.33155436957972, + "loss": 1.1306, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "grad_norm": 2.063185930252075, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "learning_rate": 0.0007165064424969377, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 1.2225693464279175, + "step": 6990 + }, + { + "ce_loss": 0.40387097001075745, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.4863589406013489, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.21981634199619293, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 1.1351462602615356, + "step": 6990 + }, + { + "ce_loss": 0.3387243151664734, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.4375489354133606, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.20835743844509125, + "step": 6990 + }, + { + "epoch": 2.334889926617745, + "loss": 1.0529, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "grad_norm": 2.4738194942474365, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "learning_rate": 0.0007162420218394925, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 1.1578466892242432, + "step": 7000 + }, + { + "ce_loss": 0.22912459075450897, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.41512879729270935, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.1929943561553955, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 1.3507180213928223, + "step": 7000 + }, + { + "ce_loss": 0.5369671583175659, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.45025479793548584, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.3083897531032562, + "step": 7000 + }, + { + "epoch": 2.3382254836557705, + "loss": 1.0127, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "grad_norm": 1.8202849626541138, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "learning_rate": 0.0007159772320935789, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.9785995483398438, + "step": 7010 + }, + { + "ce_loss": 0.2975868582725525, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.386867880821228, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.20957233011722565, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 1.2656875848770142, + "step": 7010 + }, + { + "ce_loss": 0.26873648166656494, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.35235121846199036, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.17390835285186768, + "step": 7010 + }, + { + "epoch": 2.341561040693796, + "loss": 1.1443, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "grad_norm": 1.9370781183242798, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "learning_rate": 0.0007157120735682347, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 1.189287781715393, + "step": 7020 + }, + { + "ce_loss": 0.3431004583835602, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.3850644528865814, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.2174300104379654, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 0.8547623157501221, + "step": 7020 + }, + { + "ce_loss": 0.23848767578601837, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.37629422545433044, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.1636846661567688, + "step": 7020 + }, + { + "epoch": 2.3448965977318212, + "loss": 1.0868, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "grad_norm": 2.332381010055542, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "learning_rate": 0.0007154465465729286, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.9023130536079407, + "step": 7030 + }, + { + "ce_loss": 0.21761271357536316, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.4426616132259369, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.18290437757968903, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 1.361938238143921, + "step": 7030 + }, + { + "ce_loss": 0.2959597706794739, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.4138369858264923, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.19288687407970428, + "step": 7030 + }, + { + "epoch": 2.3482321547698466, + "loss": 1.1662, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "grad_norm": 2.0527827739715576, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "learning_rate": 0.0007151806514175594, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 1.0762706995010376, + "step": 7040 + }, + { + "ce_loss": 0.3061358332633972, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.4743525981903076, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.2185249924659729, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 1.901619791984558, + "step": 7040 + }, + { + "ce_loss": 0.3605675995349884, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.4413268566131592, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.22994102537631989, + "step": 7040 + }, + { + "epoch": 2.351567711807872, + "loss": 1.0331, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "grad_norm": 1.784109354019165, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "learning_rate": 0.0007149143884124551, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 1.3922317028045654, + "step": 7050 + }, + { + "ce_loss": 0.26569482684135437, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.39552581310272217, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.22670617699623108, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 1.1182749271392822, + "step": 7050 + }, + { + "ce_loss": 0.31295108795166016, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.4126480221748352, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.20783144235610962, + "step": 7050 + }, + { + "epoch": 2.3549032688458973, + "loss": 1.1147, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "grad_norm": 2.4703519344329834, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "learning_rate": 0.0007146477578683731, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 1.0593907833099365, + "step": 7060 + }, + { + "ce_loss": 0.2643020749092102, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.3918103575706482, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.20907360315322876, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.9865421056747437, + "step": 7060 + }, + { + "ce_loss": 0.2925555408000946, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.47285857796669006, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.2207275629043579, + "step": 7060 + }, + { + "epoch": 2.3582388258839226, + "loss": 1.0843, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "grad_norm": 2.159562110900879, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "learning_rate": 0.0007143807600965004, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 1.2472641468048096, + "step": 7070 + }, + { + "ce_loss": 0.2805372178554535, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.46946394443511963, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.2066594958305359, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 1.0199264287948608, + "step": 7070 + }, + { + "ce_loss": 0.3058395981788635, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.4050654470920563, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.2362363338470459, + "step": 7070 + }, + { + "epoch": 2.361574382921948, + "loss": 1.175, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "grad_norm": 4.379484176635742, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "learning_rate": 0.0007141133954084518, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 1.5523792505264282, + "step": 7080 + }, + { + "ce_loss": 0.3695584237575531, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.6727487444877625, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.25676894187927246, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 1.258602499961853, + "step": 7080 + }, + { + "ce_loss": 0.3308517336845398, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.5026138424873352, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.2165544331073761, + "step": 7080 + }, + { + "epoch": 2.3649099399599733, + "loss": 1.1915, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "grad_norm": 1.4523996114730835, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "learning_rate": 0.0007138456641162708, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 0.9873166084289551, + "step": 7090 + }, + { + "ce_loss": 0.2942551076412201, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.4172300696372986, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.23000051081180573, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 1.6088917255401611, + "step": 7090 + }, + { + "ce_loss": 0.2873069941997528, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.46405521035194397, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.21548372507095337, + "step": 7090 + }, + { + "epoch": 2.3682454969979987, + "loss": 1.0817, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "grad_norm": 1.5708006620407104, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "learning_rate": 0.0007135775665324286, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 1.3646875619888306, + "step": 7100 + }, + { + "ce_loss": 0.29803529381752014, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.47713354229927063, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.24027717113494873, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 1.215323567390442, + "step": 7100 + }, + { + "ce_loss": 0.31460314989089966, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.46045100688934326, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.17487739026546478, + "step": 7100 + }, + { + "epoch": 2.371581054036024, + "loss": 1.0933, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "grad_norm": 1.793230652809143, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "learning_rate": 0.0007133091029698239, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 1.1998347043991089, + "step": 7110 + }, + { + "ce_loss": 0.3668796420097351, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.5767227411270142, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.18673135340213776, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 1.4413803815841675, + "step": 7110 + }, + { + "ce_loss": 0.28860530257225037, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.5573756694793701, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.1992310881614685, + "step": 7110 + }, + { + "epoch": 2.3749166110740494, + "loss": 1.1161, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "grad_norm": 1.769562840461731, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "learning_rate": 0.0007130402737417825, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.9815624356269836, + "step": 7120 + }, + { + "ce_loss": 0.24128681421279907, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.5173496007919312, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.22285443544387817, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.7874520421028137, + "step": 7120 + }, + { + "ce_loss": 0.24674908816814423, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.3168177902698517, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.22377490997314453, + "step": 7120 + }, + { + "epoch": 2.3782521681120747, + "loss": 1.0619, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "grad_norm": 1.6740084886550903, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "learning_rate": 0.0007127710791620573, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 1.0077743530273438, + "step": 7130 + }, + { + "ce_loss": 0.34261929988861084, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.39329415559768677, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.23144038021564484, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.9109863638877869, + "step": 7130 + }, + { + "ce_loss": 0.2927761375904083, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.4364514648914337, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.1817220151424408, + "step": 7130 + }, + { + "epoch": 2.3815877251501, + "loss": 1.1178, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "grad_norm": 3.0695078372955322, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "learning_rate": 0.000712501519544827, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 1.096538782119751, + "step": 7140 + }, + { + "ce_loss": 0.2581261098384857, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.4424154758453369, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.2355940043926239, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 0.9535109996795654, + "step": 7140 + }, + { + "ce_loss": 0.2059747874736786, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.39877620339393616, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.2136932760477066, + "step": 7140 + }, + { + "epoch": 2.3849232821881254, + "loss": 1.0672, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "grad_norm": 2.157912015914917, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "learning_rate": 0.000712231595204697, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 0.9159623384475708, + "step": 7150 + }, + { + "ce_loss": 0.2085161805152893, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.43406280875205994, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.21358418464660645, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 1.272546410560608, + "step": 7150 + }, + { + "ce_loss": 0.3274426758289337, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.47474992275238037, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.23619666695594788, + "step": 7150 + }, + { + "epoch": 2.388258839226151, + "loss": 1.1162, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "grad_norm": 1.9163835048675537, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "learning_rate": 0.0007119613064566976, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 1.100827693939209, + "step": 7160 + }, + { + "ce_loss": 0.31992077827453613, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.5282621383666992, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.20679350197315216, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 1.132440209388733, + "step": 7160 + }, + { + "ce_loss": 0.2905762195587158, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.4590890109539032, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.24715933203697205, + "step": 7160 + }, + { + "epoch": 2.391594396264176, + "loss": 1.0898, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "grad_norm": 2.1200919151306152, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "learning_rate": 0.0007116906536162853, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 1.1722006797790527, + "step": 7170 + }, + { + "ce_loss": 0.2898659408092499, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.5165716409683228, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.2042417675256729, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 0.8611701726913452, + "step": 7170 + }, + { + "ce_loss": 0.26445263624191284, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.4190884828567505, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.17744985222816467, + "step": 7170 + }, + { + "epoch": 2.3949299533022015, + "loss": 1.1236, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "grad_norm": 2.103191375732422, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "learning_rate": 0.000711419636999341, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 1.374045491218567, + "step": 7180 + }, + { + "ce_loss": 0.28513509035110474, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.5116044878959656, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.1862870454788208, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.8769741058349609, + "step": 7180 + }, + { + "ce_loss": 0.21405696868896484, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.4389774203300476, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.17624899744987488, + "step": 7180 + }, + { + "epoch": 2.398265510340227, + "loss": 1.1529, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "grad_norm": 1.781639575958252, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "learning_rate": 0.0007111482569221702, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 1.2831974029541016, + "step": 7190 + }, + { + "ce_loss": 0.24365730583667755, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.5405185222625732, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.18862995505332947, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 1.2575526237487793, + "step": 7190 + }, + { + "ce_loss": 0.30534759163856506, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.4284963309764862, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.22596989572048187, + "step": 7190 + }, + { + "epoch": 2.401601067378252, + "loss": 1.1763, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "grad_norm": 2.416120767593384, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "learning_rate": 0.0007108765137015025, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 1.0382710695266724, + "step": 7200 + }, + { + "ce_loss": 0.2267204225063324, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.37804681062698364, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.2037581354379654, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 1.2725032567977905, + "step": 7200 + }, + { + "ce_loss": 0.3279874622821808, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.44648557901382446, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.23695345222949982, + "step": 7200 + }, + { + "epoch": 2.4049366244162775, + "loss": 1.0089, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "grad_norm": 1.7188997268676758, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "learning_rate": 0.0007106044076544916, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 1.175480604171753, + "step": 7210 + }, + { + "ce_loss": 0.3662952482700348, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.410007119178772, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.32809457182884216, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 1.060041069984436, + "step": 7210 + }, + { + "ce_loss": 0.26448532938957214, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.39402514696121216, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.16908228397369385, + "step": 7210 + }, + { + "epoch": 2.408272181454303, + "loss": 1.0906, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "grad_norm": 2.511235237121582, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "learning_rate": 0.0007103319390987146, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 0.8862078785896301, + "step": 7220 + }, + { + "ce_loss": 0.2618538737297058, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.38268762826919556, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.24153800308704376, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 1.3634955883026123, + "step": 7220 + }, + { + "ce_loss": 0.27552342414855957, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.48205381631851196, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.1887548267841339, + "step": 7220 + }, + { + "epoch": 2.4116077384923282, + "loss": 1.1096, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "grad_norm": 1.6549991369247437, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "learning_rate": 0.0007100591083521716, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 1.0235944986343384, + "step": 7230 + }, + { + "ce_loss": 0.290785551071167, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.3853553533554077, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.2579878270626068, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 1.4111783504486084, + "step": 7230 + }, + { + "ce_loss": 0.30856239795684814, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.3862026631832123, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.17828388512134552, + "step": 7230 + }, + { + "epoch": 2.4149432955303536, + "loss": 1.156, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "grad_norm": 1.7623802423477173, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "learning_rate": 0.0007097859157332854, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 1.2994335889816284, + "step": 7240 + }, + { + "ce_loss": 0.2809923589229584, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.4485905170440674, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.23245123028755188, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 1.281017541885376, + "step": 7240 + }, + { + "ce_loss": 0.2583031952381134, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.48194581270217896, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.19480441510677338, + "step": 7240 + }, + { + "epoch": 2.418278852568379, + "loss": 1.2047, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "grad_norm": 1.3845771551132202, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "learning_rate": 0.0007095123615609013, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 1.400426983833313, + "step": 7250 + }, + { + "ce_loss": 0.41670477390289307, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.4637930393218994, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.282934308052063, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 1.1753677129745483, + "step": 7250 + }, + { + "ce_loss": 0.2805415689945221, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.5144063234329224, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.2224591225385666, + "step": 7250 + }, + { + "epoch": 2.4216144096064043, + "loss": 1.1877, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "grad_norm": 1.6489319801330566, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "learning_rate": 0.0007092384461542862, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.8170281648635864, + "step": 7260 + }, + { + "ce_loss": 0.1991329789161682, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.4296095371246338, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.1308637410402298, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 1.1548151969909668, + "step": 7260 + }, + { + "ce_loss": 0.2822519540786743, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.4908180236816406, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.23409049212932587, + "step": 7260 + }, + { + "epoch": 2.4249499666444296, + "loss": 1.1386, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "grad_norm": 3.451852798461914, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "learning_rate": 0.0007089641698331291, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 1.0679713487625122, + "step": 7270 + }, + { + "ce_loss": 0.31112760305404663, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.5061574578285217, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.25037115812301636, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.9560131430625916, + "step": 7270 + }, + { + "ce_loss": 0.23722390830516815, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.4231836795806885, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.17950914800167084, + "step": 7270 + }, + { + "epoch": 2.428285523682455, + "loss": 1.0816, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "grad_norm": 1.921055555343628, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "learning_rate": 0.0007086895329175397, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 1.4632203578948975, + "step": 7280 + }, + { + "ce_loss": 0.3397609293460846, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.4787476062774658, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.25433018803596497, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 1.5840129852294922, + "step": 7280 + }, + { + "ce_loss": 0.22069182991981506, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.4394543170928955, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.18539944291114807, + "step": 7280 + }, + { + "epoch": 2.4316210807204803, + "loss": 1.157, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "grad_norm": 2.349575996398926, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "learning_rate": 0.0007084145357280491, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.7957502603530884, + "step": 7290 + }, + { + "ce_loss": 0.2420455366373062, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.2949468493461609, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.2086876779794693, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 1.535692572593689, + "step": 7290 + }, + { + "ce_loss": 0.2967432141304016, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.4691455066204071, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.25700587034225464, + "step": 7290 + }, + { + "epoch": 2.4349566377585057, + "loss": 1.0614, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "grad_norm": 2.760479688644409, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "learning_rate": 0.0007081391785856086, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 1.2903001308441162, + "step": 7300 + }, + { + "ce_loss": 0.338364839553833, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.4312809109687805, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.2753032445907593, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 1.3935198783874512, + "step": 7300 + }, + { + "ce_loss": 0.26645249128341675, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.4124131202697754, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.2165878266096115, + "step": 7300 + }, + { + "epoch": 2.438292194796531, + "loss": 1.0521, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "grad_norm": 2.61460542678833, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "learning_rate": 0.0007078634618115896, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 1.2837860584259033, + "step": 7310 + }, + { + "ce_loss": 0.34777307510375977, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.5416009426116943, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.24954359233379364, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 1.1179468631744385, + "step": 7310 + }, + { + "ce_loss": 0.2547168731689453, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.4858138859272003, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.1752818375825882, + "step": 7310 + }, + { + "epoch": 2.4416277518345564, + "loss": 1.0628, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "grad_norm": 2.5039851665496826, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "learning_rate": 0.0007075873857277831, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 1.1534297466278076, + "step": 7320 + }, + { + "ce_loss": 0.32291606068611145, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.428676962852478, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.21789410710334778, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.9658570885658264, + "step": 7320 + }, + { + "ce_loss": 0.29541534185409546, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.37291932106018066, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.17409594357013702, + "step": 7320 + }, + { + "epoch": 2.4449633088725817, + "loss": 1.0498, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "grad_norm": 1.6878045797348022, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "learning_rate": 0.0007073109506563997, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 1.2049627304077148, + "step": 7330 + }, + { + "ce_loss": 0.29368162155151367, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.43546533584594727, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.2480524331331253, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 1.1017310619354248, + "step": 7330 + }, + { + "ce_loss": 0.3388078212738037, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.465187132358551, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.24967536330223083, + "step": 7330 + }, + { + "epoch": 2.448298865910607, + "loss": 1.0671, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "grad_norm": 2.228999614715576, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "learning_rate": 0.0007070341569200688, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 1.1676300764083862, + "step": 7340 + }, + { + "ce_loss": 0.27120673656463623, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.5025864243507385, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.17142629623413086, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 1.1279618740081787, + "step": 7340 + }, + { + "ce_loss": 0.25630664825439453, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.5078726410865784, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.19859783351421356, + "step": 7340 + }, + { + "epoch": 2.4516344229486324, + "loss": 1.1206, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "grad_norm": 2.9473607540130615, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "learning_rate": 0.0007067570048418387, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 1.2834032773971558, + "step": 7350 + }, + { + "ce_loss": 0.33302411437034607, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.543437123298645, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.24555639922618866, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 1.115805983543396, + "step": 7350 + }, + { + "ce_loss": 0.26689279079437256, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.4715449810028076, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.18562690913677216, + "step": 7350 + }, + { + "epoch": 2.454969979986658, + "loss": 1.0991, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "grad_norm": 1.9149889945983887, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "learning_rate": 0.0007064794947451753, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 1.2051515579223633, + "step": 7360 + }, + { + "ce_loss": 0.3245408535003662, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.5593629479408264, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.2596641480922699, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 1.3105435371398926, + "step": 7360 + }, + { + "ce_loss": 0.32686343789100647, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.5083961486816406, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.2100948542356491, + "step": 7360 + }, + { + "epoch": 2.458305537024683, + "loss": 1.0501, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "grad_norm": 2.0883407592773438, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "learning_rate": 0.0007062016269539631, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 0.8910499811172485, + "step": 7370 + }, + { + "ce_loss": 0.27967867255210876, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.45400354266166687, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.12001747637987137, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 1.0135114192962646, + "step": 7370 + }, + { + "ce_loss": 0.1810399889945984, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.3433760702610016, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.2140178084373474, + "step": 7370 + }, + { + "epoch": 2.4616410940627085, + "loss": 1.0993, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "grad_norm": 1.7108585834503174, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "learning_rate": 0.0007059234017925036, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 0.7910228371620178, + "step": 7380 + }, + { + "ce_loss": 0.23623459041118622, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.2763459384441376, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.21518924832344055, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 1.0838594436645508, + "step": 7380 + }, + { + "ce_loss": 0.20617178082466125, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.3263876140117645, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.11844362318515778, + "step": 7380 + }, + { + "epoch": 2.464976651100734, + "loss": 1.0635, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "grad_norm": 2.067502975463867, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "learning_rate": 0.0007056448195855154, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 1.2652387619018555, + "step": 7390 + }, + { + "ce_loss": 0.311957985162735, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.4279102087020874, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.25051218271255493, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 1.2960461378097534, + "step": 7390 + }, + { + "ce_loss": 0.3852989673614502, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.46075063943862915, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.25648966431617737, + "step": 7390 + }, + { + "epoch": 2.468312208138759, + "loss": 1.0858, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "grad_norm": 2.067422866821289, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "learning_rate": 0.0007053658806581341, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 0.9738046526908875, + "step": 7400 + }, + { + "ce_loss": 0.3073391020298004, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.46066734194755554, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.15834133327007294, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 1.1532341241836548, + "step": 7400 + }, + { + "ce_loss": 0.2673509418964386, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.5188738703727722, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.21087826788425446, + "step": 7400 + }, + { + "epoch": 2.4716477651767845, + "loss": 1.1162, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "grad_norm": 1.6830294132232666, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "learning_rate": 0.0007050865853359113, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.8965054154396057, + "step": 7410 + }, + { + "ce_loss": 0.27338090538978577, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.35894811153411865, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.2638855278491974, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.9959196448326111, + "step": 7410 + }, + { + "ce_loss": 0.29337841272354126, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.4339549243450165, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.21029168367385864, + "step": 7410 + }, + { + "epoch": 2.47498332221481, + "loss": 1.0628, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "grad_norm": 1.4016879796981812, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "learning_rate": 0.0007048069339448147, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 1.0335075855255127, + "step": 7420 + }, + { + "ce_loss": 0.31529316306114197, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.4157218933105469, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.24515531957149506, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 1.2172638177871704, + "step": 7420 + }, + { + "ce_loss": 0.308971643447876, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.4950236678123474, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.24021197855472565, + "step": 7420 + }, + { + "epoch": 2.4783188792528352, + "loss": 1.0767, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "grad_norm": 1.9759247303009033, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "learning_rate": 0.0007045269268112277, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 1.916853427886963, + "step": 7430 + }, + { + "ce_loss": 0.2808642089366913, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.3968324661254883, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.23114171624183655, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 1.0604915618896484, + "step": 7430 + }, + { + "ce_loss": 0.2519376575946808, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.3647528886795044, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.19655263423919678, + "step": 7430 + }, + { + "epoch": 2.4816544362908606, + "loss": 1.0813, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "grad_norm": 1.7491250038146973, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "learning_rate": 0.0007042465642619485, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 1.091002345085144, + "step": 7440 + }, + { + "ce_loss": 0.3512522578239441, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.52333003282547, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.21626801788806915, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.9965525269508362, + "step": 7440 + }, + { + "ce_loss": 0.24571192264556885, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.44148796796798706, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.2422831505537033, + "step": 7440 + }, + { + "epoch": 2.484989993328886, + "loss": 1.0955, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "grad_norm": 1.8003133535385132, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "learning_rate": 0.0007039658466241906, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 1.0173012018203735, + "step": 7450 + }, + { + "ce_loss": 0.26152217388153076, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.5212584733963013, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.2341955155134201, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 1.2713087797164917, + "step": 7450 + }, + { + "ce_loss": 0.2889062762260437, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.4098033607006073, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.26270008087158203, + "step": 7450 + }, + { + "epoch": 2.4883255503669113, + "loss": 1.0681, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "grad_norm": 2.4886579513549805, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "learning_rate": 0.0007036847742255818, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 1.3388160467147827, + "step": 7460 + }, + { + "ce_loss": 0.333164781332016, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.5271449089050293, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.1901921182870865, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.8606884479522705, + "step": 7460 + }, + { + "ce_loss": 0.27313685417175293, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.414323627948761, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.17308597266674042, + "step": 7460 + }, + { + "epoch": 2.4916611074049366, + "loss": 1.1727, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "grad_norm": 1.5319260358810425, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "learning_rate": 0.0007034033473941634, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.8533339500427246, + "step": 7470 + }, + { + "ce_loss": 0.1954154670238495, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.42147818207740784, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.16750460863113403, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.9842551946640015, + "step": 7470 + }, + { + "ce_loss": 0.2575088143348694, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.4689650535583496, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.2151673138141632, + "step": 7470 + }, + { + "epoch": 2.494996664442962, + "loss": 1.1905, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "grad_norm": 2.0598387718200684, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "learning_rate": 0.0007031215664583912, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 1.0834827423095703, + "step": 7480 + }, + { + "ce_loss": 0.2805913984775543, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.5459102392196655, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.18166399002075195, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 0.8282067775726318, + "step": 7480 + }, + { + "ce_loss": 0.18217626214027405, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.3959864377975464, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.18068751692771912, + "step": 7480 + }, + { + "epoch": 2.4983322214809873, + "loss": 1.1075, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "grad_norm": 2.6838510036468506, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "learning_rate": 0.0007028394317471335, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.9380246996879578, + "step": 7490 + }, + { + "ce_loss": 0.268350750207901, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.4618475139141083, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.16416999697685242, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 1.450337529182434, + "step": 7490 + }, + { + "ce_loss": 0.3254244029521942, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.6119667887687683, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.226872518658638, + "step": 7490 + }, + { + "epoch": 2.5016677785190127, + "loss": 1.0566, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "grad_norm": 1.2880253791809082, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "learning_rate": 0.0007025569435896722, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 1.4975539445877075, + "step": 7500 + }, + { + "ce_loss": 0.31754326820373535, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.48850399255752563, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.26213768124580383, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.9692498445510864, + "step": 7500 + }, + { + "ce_loss": 0.2891314625740051, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.46991127729415894, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.2100847214460373, + "step": 7500 + }, + { + "epoch": 2.505003335557038, + "loss": 1.0924, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "grad_norm": 1.5961883068084717, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "learning_rate": 0.0007022741023157013, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 1.0401496887207031, + "step": 7510 + }, + { + "ce_loss": 0.2791939675807953, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.3842151165008545, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.21842031180858612, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 1.0297224521636963, + "step": 7510 + }, + { + "ce_loss": 0.2569929361343384, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.39038562774658203, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.22410175204277039, + "step": 7510 + }, + { + "epoch": 2.5083388925950634, + "loss": 1.0764, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "grad_norm": 2.00757098197937, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "learning_rate": 0.000701990908255327, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 1.0620434284210205, + "step": 7520 + }, + { + "ce_loss": 0.22661221027374268, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.48355352878570557, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 0.22372646629810333, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 1.1504652500152588, + "step": 7520 + }, + { + "ce_loss": 0.3021428883075714, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.4501044750213623, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 0.22373048961162567, + "step": 7520 + }, + { + "epoch": 2.5116744496330887, + "loss": 1.1038, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "grad_norm": 2.8594300746917725, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "learning_rate": 0.0007017073617390671, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 1.1574616432189941, + "step": 7530 + }, + { + "ce_loss": 0.26310059428215027, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.5132943987846375, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.18846635520458221, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 1.2592483758926392, + "step": 7530 + }, + { + "ce_loss": 0.2359173595905304, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.4269721806049347, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.21566621959209442, + "step": 7530 + }, + { + "epoch": 2.515010006671114, + "loss": 1.1568, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "grad_norm": 3.1892199516296387, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "learning_rate": 0.000701423463097851, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 1.0318684577941895, + "step": 7540 + }, + { + "ce_loss": 0.24232074618339539, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.4724856913089752, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.19329185783863068, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 1.1266546249389648, + "step": 7540 + }, + { + "ce_loss": 0.3473910987377167, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.4599970877170563, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.1923563927412033, + "step": 7540 + }, + { + "epoch": 2.5183455637091394, + "loss": 1.1127, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "grad_norm": 1.9296170473098755, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "learning_rate": 0.000701139212663019, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.8258950114250183, + "step": 7550 + }, + { + "ce_loss": 0.23694711923599243, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.41021570563316345, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.17734235525131226, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 1.1382707357406616, + "step": 7550 + }, + { + "ce_loss": 0.32674920558929443, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.4254034161567688, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.2471553236246109, + "step": 7550 + }, + { + "epoch": 2.5216811207471648, + "loss": 1.0207, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "grad_norm": 1.584105372428894, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "learning_rate": 0.0007008546107663218, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.9069346785545349, + "step": 7560 + }, + { + "ce_loss": 0.20985406637191772, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.3904266953468323, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.18962639570236206, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.9962298274040222, + "step": 7560 + }, + { + "ce_loss": 0.2645713984966278, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.46430978178977966, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.21918721497058868, + "step": 7560 + }, + { + "epoch": 2.52501667778519, + "loss": 1.0951, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "grad_norm": 2.894544839859009, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "learning_rate": 0.0007005696577399206, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 0.918897271156311, + "step": 7570 + }, + { + "ce_loss": 0.22210124135017395, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.4796445965766907, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.21641956269741058, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 1.289629578590393, + "step": 7570 + }, + { + "ce_loss": 0.32594767212867737, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.5843258500099182, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.2191724181175232, + "step": 7570 + }, + { + "epoch": 2.5283522348232155, + "loss": 1.0894, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "grad_norm": 2.0715115070343018, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "learning_rate": 0.0007002843539163862, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.9046424627304077, + "step": 7580 + }, + { + "ce_loss": 0.1960611343383789, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.4171680808067322, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.20283107459545135, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 1.094390630722046, + "step": 7580 + }, + { + "ce_loss": 0.3450239598751068, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.5300965309143066, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.21903318166732788, + "step": 7580 + }, + { + "epoch": 2.531687791861241, + "loss": 1.0695, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "grad_norm": 1.8430460691452026, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "learning_rate": 0.0006999986996286989, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.8136076927185059, + "step": 7590 + }, + { + "ce_loss": 0.1908169835805893, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.37328973412513733, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.17851682007312775, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.9584760069847107, + "step": 7590 + }, + { + "ce_loss": 0.2861538827419281, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.40880483388900757, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.20923297107219696, + "step": 7590 + }, + { + "epoch": 2.535023348899266, + "loss": 1.0368, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "grad_norm": 1.763928771018982, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "learning_rate": 0.0006997126952102479, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 1.0512466430664062, + "step": 7600 + }, + { + "ce_loss": 0.280617892742157, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.45977500081062317, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.2450403869152069, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 1.2413963079452515, + "step": 7600 + }, + { + "ce_loss": 0.3729633688926697, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.5388633012771606, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.26855915784835815, + "step": 7600 + }, + { + "epoch": 2.5383589059372915, + "loss": 1.1069, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "grad_norm": 1.5387122631072998, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "learning_rate": 0.0006994263409948312, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 1.035395622253418, + "step": 7610 + }, + { + "ce_loss": 0.2836330235004425, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.4233282208442688, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.20756769180297852, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 1.4395779371261597, + "step": 7610 + }, + { + "ce_loss": 0.32102641463279724, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.562681257724762, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.24834278225898743, + "step": 7610 + }, + { + "epoch": 2.541694462975317, + "loss": 1.1417, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "grad_norm": 1.9746698141098022, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "learning_rate": 0.0006991396373166548, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 1.1625916957855225, + "step": 7620 + }, + { + "ce_loss": 0.2999421954154968, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.42592406272888184, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.24482430517673492, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 0.8874329924583435, + "step": 7620 + }, + { + "ce_loss": 0.25415265560150146, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.3725735545158386, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.2000926285982132, + "step": 7620 + }, + { + "epoch": 2.545030020013342, + "loss": 1.0415, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "grad_norm": 1.7550071477890015, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "learning_rate": 0.0006988525845103331, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 1.1274958848953247, + "step": 7630 + }, + { + "ce_loss": 0.22237154841423035, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.47276023030281067, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.25588810443878174, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 1.2401340007781982, + "step": 7630 + }, + { + "ce_loss": 0.2866066098213196, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.4623689651489258, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.21370618045330048, + "step": 7630 + }, + { + "epoch": 2.5483655770513676, + "loss": 1.1032, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "grad_norm": 2.0811660289764404, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "learning_rate": 0.0006985651829108872, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 1.0950477123260498, + "step": 7640 + }, + { + "ce_loss": 0.27928969264030457, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.47992321848869324, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.2042643129825592, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.9194262027740479, + "step": 7640 + }, + { + "ce_loss": 0.2476658970117569, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.44567611813545227, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.2102728933095932, + "step": 7640 + }, + { + "epoch": 2.551701134089393, + "loss": 1.1073, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "grad_norm": 1.6015641689300537, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "learning_rate": 0.000698277432853746, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 0.8000747561454773, + "step": 7650 + }, + { + "ce_loss": 0.2432447075843811, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.35264867544174194, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.20382060110569, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 1.2146551609039307, + "step": 7650 + }, + { + "ce_loss": 0.3808715343475342, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.5072845220565796, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.2642064094543457, + "step": 7650 + }, + { + "epoch": 2.5550366911274183, + "loss": 1.0363, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "grad_norm": 1.6908466815948486, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "learning_rate": 0.0006979893346747447, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.9030312895774841, + "step": 7660 + }, + { + "ce_loss": 0.2747442424297333, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.3266064524650574, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.23646491765975952, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.9936516284942627, + "step": 7660 + }, + { + "ce_loss": 0.28957706689834595, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.3737667500972748, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.21820056438446045, + "step": 7660 + }, + { + "epoch": 2.5583722481654436, + "loss": 1.0447, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "grad_norm": 2.1870462894439697, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "learning_rate": 0.0006977008887101248, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 1.0413737297058105, + "step": 7670 + }, + { + "ce_loss": 0.2267252653837204, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.4250425100326538, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.1708996295928955, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 1.0603262186050415, + "step": 7670 + }, + { + "ce_loss": 0.21380625665187836, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.47527724504470825, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.16205209493637085, + "step": 7670 + }, + { + "epoch": 2.561707805203469, + "loss": 1.093, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "grad_norm": 1.7474945783615112, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "learning_rate": 0.000697412095296534, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 0.9549005627632141, + "step": 7680 + }, + { + "ce_loss": 0.2424485683441162, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.4325232207775116, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.2201852947473526, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 1.0097670555114746, + "step": 7680 + }, + { + "ce_loss": 0.269885778427124, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.40437909960746765, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.2088843733072281, + "step": 7680 + }, + { + "epoch": 2.5650433622414943, + "loss": 1.0571, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "grad_norm": 2.7915446758270264, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "learning_rate": 0.0006971229547710249, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 0.9924904108047485, + "step": 7690 + }, + { + "ce_loss": 0.2706010043621063, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.46180590987205505, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.18765035271644592, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 1.382493257522583, + "step": 7690 + }, + { + "ce_loss": 0.38130655884742737, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.38000595569610596, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.24802355468273163, + "step": 7690 + }, + { + "epoch": 2.5683789192795197, + "loss": 1.0481, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "grad_norm": 2.279045343399048, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "learning_rate": 0.000696833467471056, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 1.0659339427947998, + "step": 7700 + }, + { + "ce_loss": 0.2872184216976166, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.43056899309158325, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.2018534541130066, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 1.2552160024642944, + "step": 7700 + }, + { + "ce_loss": 0.33749714493751526, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.5229628682136536, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.2517550587654114, + "step": 7700 + }, + { + "epoch": 2.571714476317545, + "loss": 1.1328, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "grad_norm": 2.3299286365509033, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "learning_rate": 0.0006965436337344899, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 1.0527980327606201, + "step": 7710 + }, + { + "ce_loss": 0.27170753479003906, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.43882137537002563, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.2398902326822281, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 1.035448431968689, + "step": 7710 + }, + { + "ce_loss": 0.2794928252696991, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.4769744873046875, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.23001840710639954, + "step": 7710 + }, + { + "epoch": 2.5750500333555704, + "loss": 1.0902, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "grad_norm": 2.4687540531158447, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "learning_rate": 0.0006962534538995938, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 1.28634774684906, + "step": 7720 + }, + { + "ce_loss": 0.3173252046108246, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.49686121940612793, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.22386427223682404, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.9877050518989563, + "step": 7720 + }, + { + "ce_loss": 0.2048882693052292, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.43360722064971924, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.21809960901737213, + "step": 7720 + }, + { + "epoch": 2.5783855903935957, + "loss": 1.0722, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "grad_norm": 1.9223451614379883, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "learning_rate": 0.0006959629283050388, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.8922337889671326, + "step": 7730 + }, + { + "ce_loss": 0.25825366377830505, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.3473392426967621, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.22594837844371796, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.8970277309417725, + "step": 7730 + }, + { + "ce_loss": 0.2750245928764343, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.3688482642173767, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.20284493267536163, + "step": 7730 + }, + { + "epoch": 2.581721147431621, + "loss": 1.0514, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "grad_norm": 2.207745313644409, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "learning_rate": 0.0006956720572898995, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 1.0820971727371216, + "step": 7740 + }, + { + "ce_loss": 0.30136752128601074, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.34338176250457764, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.2206101268529892, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 0.9287523627281189, + "step": 7740 + }, + { + "ce_loss": 0.27432000637054443, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.3677484691143036, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.23579438030719757, + "step": 7740 + }, + { + "epoch": 2.5850567044696464, + "loss": 1.0598, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "grad_norm": 1.980271577835083, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "learning_rate": 0.0006953808411936538, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 1.5159064531326294, + "step": 7750 + }, + { + "ce_loss": 0.31128838658332825, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.5279540419578552, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.24524328112602234, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 1.2730480432510376, + "step": 7750 + }, + { + "ce_loss": 0.2881583869457245, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.4585064649581909, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.20309162139892578, + "step": 7750 + }, + { + "epoch": 2.5883922615076718, + "loss": 1.1971, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "grad_norm": 1.7246359586715698, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "learning_rate": 0.0006950892803561821, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 1.2225000858306885, + "step": 7760 + }, + { + "ce_loss": 0.2808440029621124, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.4612267017364502, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.20239432156085968, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.9158413410186768, + "step": 7760 + }, + { + "ce_loss": 0.26169925928115845, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.42524948716163635, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.16362762451171875, + "step": 7760 + }, + { + "epoch": 2.591727818545697, + "loss": 1.1191, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "grad_norm": 4.186878204345703, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "learning_rate": 0.0006947973751177674, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.8495794534683228, + "step": 7770 + }, + { + "ce_loss": 0.23491553962230682, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.39159250259399414, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.22222238779067993, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.8863764405250549, + "step": 7770 + }, + { + "ce_loss": 0.24199603497982025, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.461648166179657, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.18189716339111328, + "step": 7770 + }, + { + "epoch": 2.5950633755837225, + "loss": 1.053, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "grad_norm": 2.2770771980285645, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "learning_rate": 0.0006945051258190942, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 1.204896092414856, + "step": 7780 + }, + { + "ce_loss": 0.23836398124694824, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.36417123675346375, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.23337742686271667, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.9068416953086853, + "step": 7780 + }, + { + "ce_loss": 0.2626253664493561, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.3526120185852051, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.19078634679317474, + "step": 7780 + }, + { + "epoch": 2.598398932621748, + "loss": 1.066, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "grad_norm": 1.2925469875335693, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "learning_rate": 0.0006942125328012493, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 1.0056893825531006, + "step": 7790 + }, + { + "ce_loss": 0.2945784032344818, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.47337833046913147, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.1684665083885193, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 1.032659888267517, + "step": 7790 + }, + { + "ce_loss": 0.3183746635913849, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.4784265458583832, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.192928284406662, + "step": 7790 + }, + { + "epoch": 2.601734489659773, + "loss": 1.1247, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "grad_norm": 2.7289175987243652, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "learning_rate": 0.0006939195964057199, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 1.3446162939071655, + "step": 7800 + }, + { + "ce_loss": 0.31287020444869995, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.43550536036491394, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.1966378092765808, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 1.102492332458496, + "step": 7800 + }, + { + "ce_loss": 0.295215368270874, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.4144419729709625, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.22681808471679688, + "step": 7800 + }, + { + "epoch": 2.6050700466977985, + "loss": 1.0734, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "grad_norm": 1.937639594078064, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "learning_rate": 0.0006936263169743946, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 1.1655282974243164, + "step": 7810 + }, + { + "ce_loss": 0.3544881343841553, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.5030702352523804, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.26013660430908203, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.8801066875457764, + "step": 7810 + }, + { + "ce_loss": 0.24790768325328827, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.42721542716026306, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.1820715367794037, + "step": 7810 + }, + { + "epoch": 2.608405603735824, + "loss": 1.0556, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "grad_norm": 2.0428402423858643, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "learning_rate": 0.0006933326948495617, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 1.201572060585022, + "step": 7820 + }, + { + "ce_loss": 0.2293986827135086, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.44717761874198914, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.21371591091156006, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 0.9682687520980835, + "step": 7820 + }, + { + "ce_loss": 0.3088095784187317, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.38450387120246887, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.21013160049915314, + "step": 7820 + }, + { + "epoch": 2.611741160773849, + "loss": 1.0846, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "grad_norm": 2.4795031547546387, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "learning_rate": 0.0006930387303739101, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.9328702688217163, + "step": 7830 + }, + { + "ce_loss": 0.29024484753608704, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.3580683171749115, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.2304927408695221, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.9353972673416138, + "step": 7830 + }, + { + "ce_loss": 0.23102283477783203, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.4216915965080261, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.21496886014938354, + "step": 7830 + }, + { + "epoch": 2.6150767178118746, + "loss": 1.022, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "grad_norm": 2.2489166259765625, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "learning_rate": 0.000692744423890528, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 1.247374176979065, + "step": 7840 + }, + { + "ce_loss": 0.350018173456192, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.5923084616661072, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.24629969894886017, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.8726778626441956, + "step": 7840 + }, + { + "ce_loss": 0.18625923991203308, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.46877485513687134, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.21734049916267395, + "step": 7840 + }, + { + "epoch": 2.6184122748499, + "loss": 1.1391, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "grad_norm": 2.0969858169555664, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "learning_rate": 0.0006924497757429026, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 0.7978283762931824, + "step": 7850 + }, + { + "ce_loss": 0.21526743471622467, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.3887978792190552, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.19217370450496674, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 1.0249779224395752, + "step": 7850 + }, + { + "ce_loss": 0.26744702458381653, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.3994236886501312, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.2308528572320938, + "step": 7850 + }, + { + "epoch": 2.6217478318879253, + "loss": 1.0029, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "grad_norm": 1.8193413019180298, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "learning_rate": 0.0006921547862749198, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.8859058022499084, + "step": 7860 + }, + { + "ce_loss": 0.24655620753765106, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.398909330368042, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.18014656007289886, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 1.1356027126312256, + "step": 7860 + }, + { + "ce_loss": 0.36278200149536133, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.48611417412757874, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.2426128387451172, + "step": 7860 + }, + { + "epoch": 2.6250833889259506, + "loss": 1.0975, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "grad_norm": 2.170436143875122, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "learning_rate": 0.0006918594558308643, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.9275552034378052, + "step": 7870 + }, + { + "ce_loss": 0.2591184377670288, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.38452085852622986, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.1751992255449295, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.9300224184989929, + "step": 7870 + }, + { + "ce_loss": 0.26695799827575684, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.41706058382987976, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.20200999081134796, + "step": 7870 + }, + { + "epoch": 2.628418945963976, + "loss": 1.2211, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "grad_norm": 2.2208914756774902, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "learning_rate": 0.0006915637847554186, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 1.0086395740509033, + "step": 7880 + }, + { + "ce_loss": 0.3185664713382721, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.4550648331642151, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.18952278792858124, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 0.9416465759277344, + "step": 7880 + }, + { + "ce_loss": 0.2340216040611267, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.46229973435401917, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.16806453466415405, + "step": 7880 + }, + { + "epoch": 2.6317545030020013, + "loss": 1.1962, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "grad_norm": 2.458528518676758, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "learning_rate": 0.0006912677733936626, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 1.2581466436386108, + "step": 7890 + }, + { + "ce_loss": 0.3371962904930115, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.49287542700767517, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.2400340884923935, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.9736883044242859, + "step": 7890 + }, + { + "ce_loss": 0.26119476556777954, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.39335036277770996, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.229218989610672, + "step": 7890 + }, + { + "epoch": 2.6350900600400267, + "loss": 1.1243, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "grad_norm": 1.9649882316589355, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "learning_rate": 0.0006909714220910731, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 1.0200287103652954, + "step": 7900 + }, + { + "ce_loss": 0.3254638612270355, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.41585683822631836, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.20375323295593262, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 1.4603826999664307, + "step": 7900 + }, + { + "ce_loss": 0.33990997076034546, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.3670250177383423, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.2939133048057556, + "step": 7900 + }, + { + "epoch": 2.638425617078052, + "loss": 1.062, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "grad_norm": 3.770718574523926, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "learning_rate": 0.0006906747311935243, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.9048759341239929, + "step": 7910 + }, + { + "ce_loss": 0.21760985255241394, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.43006569147109985, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.1983773112297058, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.9832034707069397, + "step": 7910 + }, + { + "ce_loss": 0.2726140022277832, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.4162827432155609, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.21636536717414856, + "step": 7910 + }, + { + "epoch": 2.6417611741160774, + "loss": 1.095, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "grad_norm": 2.420430898666382, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "learning_rate": 0.0006903777010472864, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 1.0512824058532715, + "step": 7920 + }, + { + "ce_loss": 0.3101441264152527, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.4893198311328888, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.20389099419116974, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 1.038252592086792, + "step": 7920 + }, + { + "ce_loss": 0.20743471384048462, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.43021488189697266, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.21906831860542297, + "step": 7920 + }, + { + "epoch": 2.6450967311541027, + "loss": 1.1114, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "grad_norm": 1.645699143409729, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "learning_rate": 0.0006900803319990253, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.9869709610939026, + "step": 7930 + }, + { + "ce_loss": 0.274577796459198, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.408801406621933, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.26362085342407227, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 1.5112707614898682, + "step": 7930 + }, + { + "ce_loss": 0.34168630838394165, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.6311824321746826, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.22010086476802826, + "step": 7930 + }, + { + "epoch": 2.648432288192128, + "loss": 1.1468, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "grad_norm": 2.092884063720703, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "learning_rate": 0.000689782624395803, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 1.519085168838501, + "step": 7940 + }, + { + "ce_loss": 0.3875352144241333, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.4946945607662201, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.25518402457237244, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 1.0586838722229004, + "step": 7940 + }, + { + "ce_loss": 0.3155548572540283, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.43945226073265076, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.22807960212230682, + "step": 7940 + }, + { + "epoch": 2.6517678452301534, + "loss": 1.1394, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "grad_norm": 1.931730031967163, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "learning_rate": 0.0006894845785850759, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 1.1163417100906372, + "step": 7950 + }, + { + "ce_loss": 0.28073281049728394, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.525175929069519, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.2490440309047699, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 1.207800030708313, + "step": 7950 + }, + { + "ce_loss": 0.3547218441963196, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.5505259037017822, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.24187520146369934, + "step": 7950 + }, + { + "epoch": 2.6551034022681788, + "loss": 1.2237, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "grad_norm": 2.8231544494628906, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "learning_rate": 0.0006891861949146959, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 1.2380450963974, + "step": 7960 + }, + { + "ce_loss": 0.31260645389556885, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.38258302211761475, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.23153412342071533, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 1.231788158416748, + "step": 7960 + }, + { + "ce_loss": 0.307564377784729, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.461772620677948, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.18721505999565125, + "step": 7960 + }, + { + "epoch": 2.658438959306204, + "loss": 1.0779, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "grad_norm": 1.6480693817138672, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "learning_rate": 0.0006888874737329087, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 0.8892300128936768, + "step": 7970 + }, + { + "ce_loss": 0.25990501046180725, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.33113154768943787, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.25036299228668213, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 1.1123273372650146, + "step": 7970 + }, + { + "ce_loss": 0.3147440552711487, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.4517229199409485, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.2724269926548004, + "step": 7970 + }, + { + "epoch": 2.6617745163442295, + "loss": 1.0683, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "grad_norm": 2.092548131942749, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "learning_rate": 0.000688588415388354, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.9862041473388672, + "step": 7980 + }, + { + "ce_loss": 0.31494665145874023, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.45636996626853943, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.21455398201942444, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.9188050031661987, + "step": 7980 + }, + { + "ce_loss": 0.1882411688566208, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.445490300655365, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.1915428638458252, + "step": 7980 + }, + { + "epoch": 2.665110073382255, + "loss": 1.095, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "grad_norm": 2.971047878265381, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "learning_rate": 0.0006882890202300653, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 1.2398712635040283, + "step": 7990 + }, + { + "ce_loss": 0.1910310983657837, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.4677865207195282, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.1882854700088501, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 1.357975721359253, + "step": 7990 + }, + { + "ce_loss": 0.2677784264087677, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.45560523867607117, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.22759392857551575, + "step": 7990 + }, + { + "epoch": 2.66844563042028, + "loss": 1.0626, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "grad_norm": 1.8649590015411377, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "learning_rate": 0.0006879892886074686, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 1.2235169410705566, + "step": 8000 + }, + { + "ce_loss": 0.2407296597957611, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.41924282908439636, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.19686494767665863, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 0.7969648838043213, + "step": 8000 + }, + { + "ce_loss": 0.20408573746681213, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.3923669159412384, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.1550588309764862, + "step": 8000 + }, + { + "epoch": 2.6717811874583055, + "loss": 1.0854, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "grad_norm": 1.7967385053634644, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "learning_rate": 0.0006876892208703833, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.9698286652565002, + "step": 8010 + }, + { + "ce_loss": 0.26138097047805786, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.40205711126327515, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.18110914528369904, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.9017637968063354, + "step": 8010 + }, + { + "ce_loss": 0.2703733742237091, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.404426634311676, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.17171475291252136, + "step": 8010 + }, + { + "epoch": 2.675116744496331, + "loss": 1.0782, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "grad_norm": 1.5138238668441772, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "learning_rate": 0.0006873888173690207, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 0.7773812413215637, + "step": 8020 + }, + { + "ce_loss": 0.20126408338546753, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.33068251609802246, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.17736712098121643, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 1.1545333862304688, + "step": 8020 + }, + { + "ce_loss": 0.3212941884994507, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.43402695655822754, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.2017740160226822, + "step": 8020 + }, + { + "epoch": 2.678452301534356, + "loss": 0.9684, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "grad_norm": 1.565229892730713, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "learning_rate": 0.0006870880784539837, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.9166897535324097, + "step": 8030 + }, + { + "ce_loss": 0.24335050582885742, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.4135293960571289, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.18786965310573578, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.891508936882019, + "step": 8030 + }, + { + "ce_loss": 0.2426663637161255, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.37967848777770996, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.18929001688957214, + "step": 8030 + }, + { + "epoch": 2.6817878585723816, + "loss": 1.146, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "grad_norm": 2.639207363128662, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "learning_rate": 0.0006867870044762672, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 1.2057491540908813, + "step": 8040 + }, + { + "ce_loss": 0.27712738513946533, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.40351033210754395, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.2716418206691742, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.9097023606300354, + "step": 8040 + }, + { + "ce_loss": 0.2758481502532959, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.41735514998435974, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.21631482243537903, + "step": 8040 + }, + { + "epoch": 2.685123415610407, + "loss": 1.0075, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "grad_norm": 2.203000783920288, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "learning_rate": 0.0006864855957872571, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 1.179887294769287, + "step": 8050 + }, + { + "ce_loss": 0.3261899948120117, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.4284709393978119, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.23692551255226135, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 0.9125145077705383, + "step": 8050 + }, + { + "ce_loss": 0.26923179626464844, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.3452901244163513, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.23391097784042358, + "step": 8050 + }, + { + "epoch": 2.6884589726484323, + "loss": 1.1154, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "grad_norm": 4.837062835693359, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "learning_rate": 0.0006861838527387296, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.9809233546257019, + "step": 8060 + }, + { + "ce_loss": 0.28013816475868225, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.40941011905670166, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.1691157966852188, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 1.184672236442566, + "step": 8060 + }, + { + "ce_loss": 0.3251360356807709, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.48578941822052, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.18599502742290497, + "step": 8060 + }, + { + "epoch": 2.6917945296864576, + "loss": 1.0589, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "grad_norm": 2.380504846572876, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "learning_rate": 0.0006858817756828511, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.8261231184005737, + "step": 8070 + }, + { + "ce_loss": 0.21635033190250397, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.36327552795410156, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.17453186213970184, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 1.0964490175247192, + "step": 8070 + }, + { + "ce_loss": 0.2572353780269623, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.41422608494758606, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.19209592044353485, + "step": 8070 + }, + { + "epoch": 2.695130086724483, + "loss": 1.1104, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "grad_norm": 1.6355799436569214, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "learning_rate": 0.0006855793649721783, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 1.1212942600250244, + "step": 8080 + }, + { + "ce_loss": 0.2441914677619934, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.44367820024490356, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.18526434898376465, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 1.0341012477874756, + "step": 8080 + }, + { + "ce_loss": 0.2990477681159973, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.4595893919467926, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.21027454733848572, + "step": 8080 + }, + { + "epoch": 2.6984656437625083, + "loss": 1.0644, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "grad_norm": 3.036402702331543, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "learning_rate": 0.000685276620959657, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 1.0285917520523071, + "step": 8090 + }, + { + "ce_loss": 0.23727817833423615, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.3857990801334381, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.18309199810028076, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 1.0543606281280518, + "step": 8090 + }, + { + "ce_loss": 0.2791891396045685, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.4386514723300934, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.20415925979614258, + "step": 8090 + }, + { + "epoch": 2.7018012008005337, + "loss": 1.0951, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "grad_norm": 1.9897665977478027, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "learning_rate": 0.000684973543998622, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.915390133857727, + "step": 8100 + }, + { + "ce_loss": 0.2011374980211258, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.37561312317848206, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.16644081473350525, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 1.2802246809005737, + "step": 8100 + }, + { + "ce_loss": 0.2943989336490631, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.44954586029052734, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.2091292142868042, + "step": 8100 + }, + { + "epoch": 2.705136757838559, + "loss": 1.0826, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "grad_norm": 1.6685622930526733, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "learning_rate": 0.0006846701344427967, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 0.8103539943695068, + "step": 8110 + }, + { + "ce_loss": 0.2346825748682022, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.32806381583213806, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.17953945696353912, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 1.1475951671600342, + "step": 8110 + }, + { + "ce_loss": 0.2751637399196625, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.323055237531662, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.21816794574260712, + "step": 8110 + }, + { + "epoch": 2.7084723148765844, + "loss": 1.0316, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "grad_norm": 17.63924217224121, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "learning_rate": 0.0006843663926462927, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 1.017307996749878, + "step": 8120 + }, + { + "ce_loss": 0.27584630250930786, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.4709221422672272, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.21881969273090363, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 1.410461187362671, + "step": 8120 + }, + { + "ce_loss": 0.3697640299797058, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.5594056248664856, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.2293158620595932, + "step": 8120 + }, + { + "epoch": 2.7118078719146097, + "loss": 1.1385, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "grad_norm": 1.8438341617584229, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "learning_rate": 0.0006840623189636095, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.8964990973472595, + "step": 8130 + }, + { + "ce_loss": 0.22235225141048431, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.3620837926864624, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.13280965387821198, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 1.0640838146209717, + "step": 8130 + }, + { + "ce_loss": 0.20717808604240417, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.38882189989089966, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.1656038612127304, + "step": 8130 + }, + { + "epoch": 2.715143428952635, + "loss": 1.0427, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "grad_norm": 1.8322330713272095, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "learning_rate": 0.0006837579137496336, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 1.317813515663147, + "step": 8140 + }, + { + "ce_loss": 0.35364487767219543, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.48588964343070984, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.22830376029014587, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 1.2036542892456055, + "step": 8140 + }, + { + "ce_loss": 0.2760239243507385, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.4591331481933594, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.24318912625312805, + "step": 8140 + }, + { + "epoch": 2.7184789859906604, + "loss": 1.1119, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "grad_norm": 1.984826922416687, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "learning_rate": 0.0006834531773596388, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 1.117841362953186, + "step": 8150 + }, + { + "ce_loss": 0.32421332597732544, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.47627392411231995, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.23721623420715332, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.6992880702018738, + "step": 8150 + }, + { + "ce_loss": 0.1821093112230301, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.3342851400375366, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.1452513188123703, + "step": 8150 + }, + { + "epoch": 2.7218145430286858, + "loss": 1.0617, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "grad_norm": 2.205428123474121, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "learning_rate": 0.0006831481101492852, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.9553093314170837, + "step": 8160 + }, + { + "ce_loss": 0.28660255670547485, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.4710615277290344, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.1975298523902893, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.9735440611839294, + "step": 8160 + }, + { + "ce_loss": 0.20919136703014374, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.39127933979034424, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.1659659743309021, + "step": 8160 + }, + { + "epoch": 2.725150100066711, + "loss": 1.1133, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "grad_norm": 5.617023944854736, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "learning_rate": 0.000682842712474619, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.883911669254303, + "step": 8170 + }, + { + "ce_loss": 0.24566011130809784, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.40629303455352783, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.186699777841568, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.915961742401123, + "step": 8170 + }, + { + "ce_loss": 0.27848586440086365, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.4564547836780548, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.18077746033668518, + "step": 8170 + }, + { + "epoch": 2.7284856571047365, + "loss": 1.0033, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "grad_norm": 2.667583703994751, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "learning_rate": 0.0006825369846920722, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.8582113981246948, + "step": 8180 + }, + { + "ce_loss": 0.2530399262905121, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.41539907455444336, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.1895999312400818, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 1.4451714754104614, + "step": 8180 + }, + { + "ce_loss": 0.21054810285568237, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.4180562496185303, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.17314787209033966, + "step": 8180 + }, + { + "epoch": 2.731821214142762, + "loss": 1.0923, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "grad_norm": 2.334686279296875, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "learning_rate": 0.0006822309271584622, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 0.958365797996521, + "step": 8190 + }, + { + "ce_loss": 0.3037683665752411, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.41571107506752014, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.186298206448555, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 1.1305922269821167, + "step": 8190 + }, + { + "ce_loss": 0.2983015775680542, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.42953890562057495, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.2160416692495346, + "step": 8190 + }, + { + "epoch": 2.735156771180787, + "loss": 1.1597, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "grad_norm": 2.6440696716308594, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "learning_rate": 0.0006819245402309907, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 1.0382781028747559, + "step": 8200 + }, + { + "ce_loss": 0.267867773771286, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.39029863476753235, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.23540931940078735, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 0.9972876906394958, + "step": 8200 + }, + { + "ce_loss": 0.3134215474128723, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.42370691895484924, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.20167101919651031, + "step": 8200 + }, + { + "epoch": 2.7384923282188125, + "loss": 1.1542, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "grad_norm": 2.106982469558716, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "learning_rate": 0.0006816178242672446, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.9670668244361877, + "step": 8210 + }, + { + "ce_loss": 0.3226143419742584, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.4178784489631653, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.22630652785301208, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 1.2102785110473633, + "step": 8210 + }, + { + "ce_loss": 0.31588634848594666, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.4046383798122406, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.21099373698234558, + "step": 8210 + }, + { + "epoch": 2.741827885256838, + "loss": 1.1407, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "grad_norm": 2.294503688812256, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "learning_rate": 0.000681310779625194, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 0.7896057367324829, + "step": 8220 + }, + { + "ce_loss": 0.22302307188510895, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.4146197736263275, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.15178342163562775, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 1.2498191595077515, + "step": 8220 + }, + { + "ce_loss": 0.31091174483299255, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.4844982624053955, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.22255325317382812, + "step": 8220 + }, + { + "epoch": 2.745163442294863, + "loss": 1.0505, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "grad_norm": 3.4932470321655273, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "learning_rate": 0.0006810034066631935, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.8021654486656189, + "step": 8230 + }, + { + "ce_loss": 0.2264508605003357, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.35844430327415466, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.1736733764410019, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.8777614831924438, + "step": 8230 + }, + { + "ce_loss": 0.29190006852149963, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.39581429958343506, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.18982553482055664, + "step": 8230 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.9949, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "grad_norm": 1.8267552852630615, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "learning_rate": 0.0006806957057399802, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.9348551630973816, + "step": 8240 + }, + { + "ce_loss": 0.2395116537809372, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.43146926164627075, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.18208366632461548, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.9479954838752747, + "step": 8240 + }, + { + "ce_loss": 0.2667170464992523, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.4461016058921814, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.18788105249404907, + "step": 8240 + }, + { + "epoch": 2.751834556370914, + "loss": 1.1488, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "grad_norm": 1.7486326694488525, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "learning_rate": 0.0006803876772146741, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 1.0329201221466064, + "step": 8250 + }, + { + "ce_loss": 0.3410790264606476, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.4854319989681244, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.2022295445203781, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 0.7077986598014832, + "step": 8250 + }, + { + "ce_loss": 0.20355767011642456, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.3223862648010254, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.14093159139156342, + "step": 8250 + }, + { + "epoch": 2.7551701134089392, + "loss": 1.0387, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "grad_norm": 1.5510363578796387, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "learning_rate": 0.0006800793214467776, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.9963008165359497, + "step": 8260 + }, + { + "ce_loss": 0.2472948133945465, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.417851060628891, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.17874693870544434, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 1.1251202821731567, + "step": 8260 + }, + { + "ce_loss": 0.31395453214645386, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.48657265305519104, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.239948108792305, + "step": 8260 + }, + { + "epoch": 2.7585056704469646, + "loss": 1.0674, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "grad_norm": 2.9985060691833496, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "learning_rate": 0.0006797706387961754, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 1.797513484954834, + "step": 8270 + }, + { + "ce_loss": 0.330750048160553, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.6316664814949036, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.2871108651161194, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 1.196751356124878, + "step": 8270 + }, + { + "ce_loss": 0.28149712085723877, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.5029041171073914, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.22309550642967224, + "step": 8270 + }, + { + "epoch": 2.76184122748499, + "loss": 1.1136, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "grad_norm": 1.9858359098434448, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "learning_rate": 0.0006794616296231331, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 1.0421366691589355, + "step": 8280 + }, + { + "ce_loss": 0.25688180327415466, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.41856256127357483, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.19284594058990479, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 0.9045016765594482, + "step": 8280 + }, + { + "ce_loss": 0.24156159162521362, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.4102836847305298, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.18480077385902405, + "step": 8280 + }, + { + "epoch": 2.7651767845230153, + "loss": 1.0542, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "grad_norm": 2.0464863777160645, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "learning_rate": 0.0006791522942882976, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 1.046781063079834, + "step": 8290 + }, + { + "ce_loss": 0.2372472733259201, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.48630839586257935, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.21556465327739716, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 1.025158166885376, + "step": 8290 + }, + { + "ce_loss": 0.25177982449531555, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.44891324639320374, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.19279952347278595, + "step": 8290 + }, + { + "epoch": 2.7685123415610406, + "loss": 1.0936, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "grad_norm": 3.3846592903137207, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "learning_rate": 0.000678842633152697, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.7709948420524597, + "step": 8300 + }, + { + "ce_loss": 0.23251450061798096, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.3573760986328125, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.18105630576610565, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.9063900709152222, + "step": 8300 + }, + { + "ce_loss": 0.2689492106437683, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.40178921818733215, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.196784108877182, + "step": 8300 + }, + { + "epoch": 2.771847898599066, + "loss": 0.9933, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "grad_norm": 1.6398341655731201, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "learning_rate": 0.0006785326465777384, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 1.0609391927719116, + "step": 8310 + }, + { + "ce_loss": 0.3316725194454193, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.4921702742576599, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.17898264527320862, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 1.0043429136276245, + "step": 8310 + }, + { + "ce_loss": 0.26267266273498535, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.44388797879219055, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.1953670084476471, + "step": 8310 + }, + { + "epoch": 2.7751834556370913, + "loss": 1.0848, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "grad_norm": 2.018314838409424, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "learning_rate": 0.0006782223349252101, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.9457603096961975, + "step": 8320 + }, + { + "ce_loss": 0.22221912443637848, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.46346578001976013, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.24278980493545532, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.9505172967910767, + "step": 8320 + }, + { + "ce_loss": 0.3089073598384857, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.41338473558425903, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.17162297666072845, + "step": 8320 + }, + { + "epoch": 2.7785190126751167, + "loss": 1.1974, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "grad_norm": 2.569556474685669, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "learning_rate": 0.0006779116985572789, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.789454996585846, + "step": 8330 + }, + { + "ce_loss": 0.24249128997325897, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.34002089500427246, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.20667454600334167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 1.1776317358016968, + "step": 8330 + }, + { + "ce_loss": 0.35128793120384216, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.4592365324497223, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.21480117738246918, + "step": 8330 + }, + { + "epoch": 2.781854569713142, + "loss": 1.0435, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "grad_norm": 1.6675870418548584, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "learning_rate": 0.0006776007378364909, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 1.3947495222091675, + "step": 8340 + }, + { + "ce_loss": 0.25217753648757935, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.3868056535720825, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.16207663714885712, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 0.9727454781532288, + "step": 8340 + }, + { + "ce_loss": 0.34851181507110596, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.3823917806148529, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.18593260645866394, + "step": 8340 + }, + { + "epoch": 2.7851901267511674, + "loss": 1.0944, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "grad_norm": 1.8214737176895142, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "learning_rate": 0.0006772894531257709, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 1.036094069480896, + "step": 8350 + }, + { + "ce_loss": 0.31758782267570496, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.40042489767074585, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.2421387881040573, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 1.1778686046600342, + "step": 8350 + }, + { + "ce_loss": 0.22891831398010254, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.37844356894493103, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.20239907503128052, + "step": 8350 + }, + { + "epoch": 2.7885256837891927, + "loss": 1.1012, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "grad_norm": 2.9142262935638428, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "learning_rate": 0.0006769778447884214, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.988903820514679, + "step": 8360 + }, + { + "ce_loss": 0.2615964412689209, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.437855064868927, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.17277489602565765, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.9511721730232239, + "step": 8360 + }, + { + "ce_loss": 0.26001596450805664, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.46625223755836487, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.22466371953487396, + "step": 8360 + }, + { + "epoch": 2.791861240827218, + "loss": 1.0732, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "grad_norm": 2.446507453918457, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "learning_rate": 0.000676665913188123, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 1.571579933166504, + "step": 8370 + }, + { + "ce_loss": 0.23533394932746887, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.4526551067829132, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.20821493864059448, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 1.2163251638412476, + "step": 8370 + }, + { + "ce_loss": 0.2753696143627167, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.42554229497909546, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.23759661614894867, + "step": 8370 + }, + { + "epoch": 2.7951967978652434, + "loss": 1.1284, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "grad_norm": 3.0520758628845215, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "learning_rate": 0.0006763536586889335, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 1.0454256534576416, + "step": 8380 + }, + { + "ce_loss": 0.29537466168403625, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.39937081933021545, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.24960553646087646, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.8441163897514343, + "step": 8380 + }, + { + "ce_loss": 0.24988539516925812, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.29676154255867004, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.18834713101387024, + "step": 8380 + }, + { + "epoch": 2.798532354903269, + "loss": 1.0066, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "grad_norm": 1.6638082265853882, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "learning_rate": 0.0006760410816552874, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.981625497341156, + "step": 8390 + }, + { + "ce_loss": 0.2807079553604126, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.37848836183547974, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.20049092173576355, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.9530798196792603, + "step": 8390 + }, + { + "ce_loss": 0.2755109965801239, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.42244938015937805, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.18968243896961212, + "step": 8390 + }, + { + "epoch": 2.801867911941294, + "loss": 1.0347, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "grad_norm": 2.0404865741729736, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "learning_rate": 0.0006757281824519958, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.8567011952400208, + "step": 8400 + }, + { + "ce_loss": 0.1716955602169037, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.38128066062927246, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.16394934058189392, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.983532726764679, + "step": 8400 + }, + { + "ce_loss": 0.26755788922309875, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.5026155114173889, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.21269215643405914, + "step": 8400 + }, + { + "epoch": 2.8052034689793195, + "loss": 1.0684, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "grad_norm": 5.197111129760742, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "learning_rate": 0.0006754149614442457, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.7837503552436829, + "step": 8410 + }, + { + "ce_loss": 0.2317720502614975, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.3728940486907959, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.17860761284828186, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.89178866147995, + "step": 8410 + }, + { + "ce_loss": 0.22866491973400116, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.46962761878967285, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.1930232048034668, + "step": 8410 + }, + { + "epoch": 2.808539026017345, + "loss": 1.1309, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "grad_norm": 3.5530993938446045, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "learning_rate": 0.0006751014189975995, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 1.0465023517608643, + "step": 8420 + }, + { + "ce_loss": 0.2774046063423157, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.3714715242385864, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.22385916113853455, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 1.0804543495178223, + "step": 8420 + }, + { + "ce_loss": 0.2771798372268677, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.4003911018371582, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.18247495591640472, + "step": 8420 + }, + { + "epoch": 2.81187458305537, + "loss": 1.1442, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "grad_norm": 3.6089842319488525, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "learning_rate": 0.0006747875554779955, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 1.0301923751831055, + "step": 8430 + }, + { + "ce_loss": 0.3191302716732025, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.394956111907959, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.18678748607635498, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 0.7958675026893616, + "step": 8430 + }, + { + "ce_loss": 0.2141757309436798, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.33181634545326233, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.13455720245838165, + "step": 8430 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.9705, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "grad_norm": 2.648263931274414, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "learning_rate": 0.0006744733712517457, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.9274229407310486, + "step": 8440 + }, + { + "ce_loss": 0.2939596474170685, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.42375898361206055, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.1693873107433319, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.9380104541778564, + "step": 8440 + }, + { + "ce_loss": 0.3217563033103943, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.3950371742248535, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.22082169353961945, + "step": 8440 + }, + { + "epoch": 2.818545697131421, + "loss": 1.0684, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "grad_norm": 4.051844120025635, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "learning_rate": 0.0006741588666855371, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 1.2058062553405762, + "step": 8450 + }, + { + "ce_loss": 0.24386708438396454, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.4271876811981201, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.1798713207244873, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 0.9697070121765137, + "step": 8450 + }, + { + "ce_loss": 0.2771538496017456, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.3977970480918884, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.18675166368484497, + "step": 8450 + }, + { + "epoch": 2.8218812541694462, + "loss": 1.0559, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "grad_norm": 4.089424133300781, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "learning_rate": 0.0006738440421464305, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.9530472159385681, + "step": 8460 + }, + { + "ce_loss": 0.2687753140926361, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.4895554482936859, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.19376049935817719, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 1.1800388097763062, + "step": 8460 + }, + { + "ce_loss": 0.32920578122138977, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.47976043820381165, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.2260797768831253, + "step": 8460 + }, + { + "epoch": 2.8252168112074716, + "loss": 1.0713, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "grad_norm": 2.832854747772217, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "learning_rate": 0.0006735288980018597, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 1.5145750045776367, + "step": 8470 + }, + { + "ce_loss": 0.22480595111846924, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.36506372690200806, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.24038757383823395, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.9549174904823303, + "step": 8470 + }, + { + "ce_loss": 0.25437024235725403, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.42316046357154846, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.2307279109954834, + "step": 8470 + }, + { + "epoch": 2.828552368245497, + "loss": 1.0884, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "grad_norm": 1.985348105430603, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "learning_rate": 0.000673213434619632, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 1.0262728929519653, + "step": 8480 + }, + { + "ce_loss": 0.28059670329093933, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.40128839015960693, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.21130184829235077, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 0.8576571345329285, + "step": 8480 + }, + { + "ce_loss": 0.25547927618026733, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.3276759684085846, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.24074243009090424, + "step": 8480 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.9914, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "grad_norm": 1.8692268133163452, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "learning_rate": 0.0006728976523679272, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.9765045046806335, + "step": 8490 + }, + { + "ce_loss": 0.32396554946899414, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.4324900507926941, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.21977542340755463, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 1.1541446447372437, + "step": 8490 + }, + { + "ce_loss": 0.2842647433280945, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.47232362627983093, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.24020341038703918, + "step": 8490 + }, + { + "epoch": 2.8352234823215476, + "loss": 1.0561, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "grad_norm": 1.4203295707702637, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "learning_rate": 0.0006725815516152972, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.919762909412384, + "step": 8500 + }, + { + "ce_loss": 0.2890332043170929, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.37516260147094727, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.1926632821559906, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 1.1556134223937988, + "step": 8500 + }, + { + "ce_loss": 0.30083900690078735, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.4435248374938965, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.21375201642513275, + "step": 8500 + }, + { + "epoch": 2.838559039359573, + "loss": 1.0273, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "grad_norm": 1.6670829057693481, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "learning_rate": 0.0006722651327306654, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 0.9069607257843018, + "step": 8510 + }, + { + "ce_loss": 0.29646047949790955, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.38407665491104126, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.22544215619564056, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 1.033137321472168, + "step": 8510 + }, + { + "ce_loss": 0.28312405943870544, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.44431746006011963, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.24237962067127228, + "step": 8510 + }, + { + "epoch": 2.8418945963975983, + "loss": 1.0609, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "grad_norm": 2.3371927738189697, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "learning_rate": 0.0006719483960833267, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 1.079877257347107, + "step": 8520 + }, + { + "ce_loss": 0.2666374146938324, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.47163528203964233, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.2189643830060959, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 1.252240777015686, + "step": 8520 + }, + { + "ce_loss": 0.3274886906147003, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.48647746443748474, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.19743898510932922, + "step": 8520 + }, + { + "epoch": 2.8452301534356237, + "loss": 1.1152, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "grad_norm": 3.188063383102417, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "learning_rate": 0.0006716313420429469, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 1.0578892230987549, + "step": 8530 + }, + { + "ce_loss": 0.27939069271087646, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.41141924262046814, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.23937836289405823, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 1.0810478925704956, + "step": 8530 + }, + { + "ce_loss": 0.2958105802536011, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.4896780252456665, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.22165045142173767, + "step": 8530 + }, + { + "epoch": 2.848565710473649, + "loss": 1.0383, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "grad_norm": 2.313164234161377, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "learning_rate": 0.0006713139709795621, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 1.241434931755066, + "step": 8540 + }, + { + "ce_loss": 0.3561538755893707, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.5506157279014587, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.2594704031944275, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 0.9683080911636353, + "step": 8540 + }, + { + "ce_loss": 0.25911006331443787, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.41315120458602905, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.1843436062335968, + "step": 8540 + }, + { + "epoch": 2.8519012675116744, + "loss": 1.0649, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "grad_norm": 1.7151241302490234, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "learning_rate": 0.0006709962832635789, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 1.2007484436035156, + "step": 8550 + }, + { + "ce_loss": 0.27435940504074097, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.5308864116668701, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.22154894471168518, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 1.5852737426757812, + "step": 8550 + }, + { + "ce_loss": 0.2574402391910553, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.3926985263824463, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.2215213030576706, + "step": 8550 + }, + { + "epoch": 2.8552368245496997, + "loss": 1.1423, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "grad_norm": 2.2228615283966064, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "learning_rate": 0.0006706782792657725, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 1.118459939956665, + "step": 8560 + }, + { + "ce_loss": 0.3260303735733032, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.48777714371681213, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.23019662499427795, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.9366552233695984, + "step": 8560 + }, + { + "ce_loss": 0.2822265326976776, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.4189026951789856, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.23510710895061493, + "step": 8560 + }, + { + "epoch": 2.858572381587725, + "loss": 1.0516, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "grad_norm": 1.732473373413086, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "learning_rate": 0.0006703599593572881, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 1.032302975654602, + "step": 8570 + }, + { + "ce_loss": 0.29478833079338074, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.5042816400527954, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.1814073920249939, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 0.9531590938568115, + "step": 8570 + }, + { + "ce_loss": 0.20203471183776855, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.4454876184463501, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.24680931866168976, + "step": 8570 + }, + { + "epoch": 2.8619079386257504, + "loss": 1.0961, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "grad_norm": 1.977342963218689, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "learning_rate": 0.000670041323909639, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 1.0410369634628296, + "step": 8580 + }, + { + "ce_loss": 0.3173748850822449, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.4767785966396332, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.2090308666229248, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.8746610879898071, + "step": 8580 + }, + { + "ce_loss": 0.216957688331604, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.4328778386116028, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.15189319849014282, + "step": 8580 + }, + { + "epoch": 2.865243495663776, + "loss": 1.107, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "grad_norm": 1.9085386991500854, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "learning_rate": 0.0006697223732947075, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 1.0445042848587036, + "step": 8590 + }, + { + "ce_loss": 0.2784845530986786, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.4100443422794342, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.22100576758384705, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 1.0747194290161133, + "step": 8590 + }, + { + "ce_loss": 0.25558221340179443, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.36318057775497437, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.17570355534553528, + "step": 8590 + }, + { + "epoch": 2.868579052701801, + "loss": 1.063, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "grad_norm": 2.178400754928589, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "learning_rate": 0.000669403107884743, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 0.9615577459335327, + "step": 8600 + }, + { + "ce_loss": 0.2424328625202179, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.3830501139163971, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.28199291229248047, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 1.2277705669403076, + "step": 8600 + }, + { + "ce_loss": 0.2811940610408783, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.35547733306884766, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.23016276955604553, + "step": 8600 + }, + { + "epoch": 2.8719146097398265, + "loss": 1.081, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "grad_norm": 2.179974317550659, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "learning_rate": 0.0006690835280523624, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 1.0438921451568604, + "step": 8610 + }, + { + "ce_loss": 0.27640753984451294, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.43892744183540344, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.2078024297952652, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 1.0383868217468262, + "step": 8610 + }, + { + "ce_loss": 0.3272160589694977, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.3793815076351166, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.24669574201107025, + "step": 8610 + }, + { + "epoch": 2.875250166777852, + "loss": 0.9902, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "grad_norm": 2.012612819671631, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "learning_rate": 0.0006687636341705501, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.9611943364143372, + "step": 8620 + }, + { + "ce_loss": 0.2872672379016876, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.36733901500701904, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.2083406001329422, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.9080339670181274, + "step": 8620 + }, + { + "ce_loss": 0.20383448898792267, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.32772207260131836, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.15557782351970673, + "step": 8620 + }, + { + "epoch": 2.878585723815877, + "loss": 0.9701, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "grad_norm": 2.836735725402832, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "learning_rate": 0.0006684434266126566, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 1.122562050819397, + "step": 8630 + }, + { + "ce_loss": 0.35167136788368225, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.40869832038879395, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.27053526043891907, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 0.8298502564430237, + "step": 8630 + }, + { + "ce_loss": 0.23591288924217224, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.3813800513744354, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.18113718926906586, + "step": 8630 + }, + { + "epoch": 2.8819212808539025, + "loss": 1.0493, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "grad_norm": 1.8520641326904297, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "learning_rate": 0.0006681229057523986, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 1.0482269525527954, + "step": 8640 + }, + { + "ce_loss": 0.24984189867973328, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.45180055499076843, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.20239730179309845, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.9643755555152893, + "step": 8640 + }, + { + "ce_loss": 0.2628970742225647, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.405941903591156, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.2428521364927292, + "step": 8640 + }, + { + "epoch": 2.885256837891928, + "loss": 1.087, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "grad_norm": 3.146603584289551, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "learning_rate": 0.0006678020719638582, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.8171414136886597, + "step": 8650 + }, + { + "ce_loss": 0.24824285507202148, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.38094547390937805, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.18764515221118927, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.9321733117103577, + "step": 8650 + }, + { + "ce_loss": 0.2584330439567566, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.47445717453956604, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.19905956089496613, + "step": 8650 + }, + { + "epoch": 2.8885923949299532, + "loss": 1.0679, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "grad_norm": 2.0428476333618164, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "learning_rate": 0.0006674809256214832, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 1.0615755319595337, + "step": 8660 + }, + { + "ce_loss": 0.3499814569950104, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.40423911809921265, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.23165345191955566, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.8028374314308167, + "step": 8660 + }, + { + "ce_loss": 0.23720194399356842, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.30180805921554565, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.2078477293252945, + "step": 8660 + }, + { + "epoch": 2.8919279519679786, + "loss": 1.1032, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "grad_norm": 4.174248218536377, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "learning_rate": 0.0006671594671000859, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 1.3050706386566162, + "step": 8670 + }, + { + "ce_loss": 0.29188674688339233, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.47933465242385864, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.24941763281822205, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.9996240139007568, + "step": 8670 + }, + { + "ce_loss": 0.23974673449993134, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.4903826415538788, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.20120561122894287, + "step": 8670 + }, + { + "epoch": 2.895263509006004, + "loss": 1.0501, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "grad_norm": 2.0826456546783447, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "learning_rate": 0.000666837696774843, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 1.132706880569458, + "step": 8680 + }, + { + "ce_loss": 0.2618308663368225, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.40235596895217896, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.16059039533138275, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 1.1171619892120361, + "step": 8680 + }, + { + "ce_loss": 0.28033655881881714, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.3423466980457306, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.23442880809307098, + "step": 8680 + }, + { + "epoch": 2.8985990660440293, + "loss": 1.0142, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "grad_norm": 1.619909644126892, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "learning_rate": 0.000666515615021295, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.980444610118866, + "step": 8690 + }, + { + "ce_loss": 0.3060693144798279, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.3914102613925934, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.21149593591690063, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.8747573494911194, + "step": 8690 + }, + { + "ce_loss": 0.2622194290161133, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.35515880584716797, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.20144023001194, + "step": 8690 + }, + { + "epoch": 2.9019346230820546, + "loss": 1.0804, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "grad_norm": 1.6870344877243042, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "learning_rate": 0.0006661932222153459, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.8705916404724121, + "step": 8700 + }, + { + "ce_loss": 0.25805947184562683, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.420367568731308, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.1892354041337967, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.881604790687561, + "step": 8700 + }, + { + "ce_loss": 0.23483748733997345, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.4109418988227844, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.18561656773090363, + "step": 8700 + }, + { + "epoch": 2.90527018012008, + "loss": 1.0964, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "grad_norm": 2.7310404777526855, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "learning_rate": 0.0006658705187332629, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 0.9659359455108643, + "step": 8710 + }, + { + "ce_loss": 0.20541168749332428, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.39143872261047363, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.23953206837177277, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 1.1124589443206787, + "step": 8710 + }, + { + "ce_loss": 0.2792068123817444, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.5193973183631897, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.23650509119033813, + "step": 8710 + }, + { + "epoch": 2.9086057371581053, + "loss": 1.0421, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "grad_norm": 1.4814128875732422, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "learning_rate": 0.0006655475049516757, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 1.0588215589523315, + "step": 8720 + }, + { + "ce_loss": 0.2646660804748535, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.49552783370018005, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.21849878132343292, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.7920655608177185, + "step": 8720 + }, + { + "ce_loss": 0.22781747579574585, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.3877447545528412, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.17634247243404388, + "step": 8720 + }, + { + "epoch": 2.9119412941961307, + "loss": 1.0592, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "grad_norm": 1.987634539604187, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "learning_rate": 0.0006652241812475762, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 1.0713099241256714, + "step": 8730 + }, + { + "ce_loss": 0.22051984071731567, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.35058051347732544, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.18141403794288635, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 1.0249351263046265, + "step": 8730 + }, + { + "ce_loss": 0.22720232605934143, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.39575693011283875, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.22523285448551178, + "step": 8730 + }, + { + "epoch": 2.915276851234156, + "loss": 0.9748, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "grad_norm": 1.5860884189605713, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "learning_rate": 0.0006649005479983179, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.8226218819618225, + "step": 8740 + }, + { + "ce_loss": 0.20729947090148926, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.4032948911190033, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.16638141870498657, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.76434725522995, + "step": 8740 + }, + { + "ce_loss": 0.21286216378211975, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.3148097097873688, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.17197924852371216, + "step": 8740 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.9997, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "grad_norm": 2.5379202365875244, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "learning_rate": 0.0006645766055816155, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 1.0224852561950684, + "step": 8750 + }, + { + "ce_loss": 0.24008406698703766, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.3388434946537018, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.17256751656532288, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 1.160400390625, + "step": 8750 + }, + { + "ce_loss": 0.38929951190948486, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.5098603963851929, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.2422654628753662, + "step": 8750 + }, + { + "epoch": 2.9219479653102067, + "loss": 1.0708, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "grad_norm": 2.1415038108825684, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "learning_rate": 0.0006642523543755449, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.8797162175178528, + "step": 8760 + }, + { + "ce_loss": 0.2512281537055969, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.3936854600906372, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.1777983009815216, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.9649273753166199, + "step": 8760 + }, + { + "ce_loss": 0.23668411374092102, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.4126769006252289, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.23348350822925568, + "step": 8760 + }, + { + "epoch": 2.925283522348232, + "loss": 1.0869, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "grad_norm": 1.9545202255249023, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "learning_rate": 0.0006639277947585419, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 1.432930588722229, + "step": 8770 + }, + { + "ce_loss": 0.3243611454963684, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.5498109459877014, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.27567988634109497, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 1.0950384140014648, + "step": 8770 + }, + { + "ce_loss": 0.2891460359096527, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.544788122177124, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.20279133319854736, + "step": 8770 + }, + { + "epoch": 2.9286190793862574, + "loss": 1.062, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "grad_norm": 1.8844976425170898, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "learning_rate": 0.0006636029271094026, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.8158213496208191, + "step": 8780 + }, + { + "ce_loss": 0.24880477786064148, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.3861525058746338, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.17992152273654938, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.9338173270225525, + "step": 8780 + }, + { + "ce_loss": 0.27490976452827454, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.4086349904537201, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.19639791548252106, + "step": 8780 + }, + { + "epoch": 2.931954636424283, + "loss": 0.9566, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "grad_norm": 1.5922753810882568, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "learning_rate": 0.0006632777518072826, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 1.1115561723709106, + "step": 8790 + }, + { + "ce_loss": 0.32640036940574646, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.457407683134079, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.24649685621261597, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 0.9608051776885986, + "step": 8790 + }, + { + "ce_loss": 0.20317061245441437, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.42976054549217224, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.22405625879764557, + "step": 8790 + }, + { + "epoch": 2.935290193462308, + "loss": 1.0111, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "grad_norm": 1.4414767026901245, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "learning_rate": 0.0006629522692316964, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.8295692801475525, + "step": 8800 + }, + { + "ce_loss": 0.2127918004989624, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.34422290325164795, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.22246137261390686, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.9614005088806152, + "step": 8800 + }, + { + "ce_loss": 0.310360848903656, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.3648439347743988, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.22835128009319305, + "step": 8800 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.9828, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "grad_norm": 2.4150519371032715, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "learning_rate": 0.0006626264797625171, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.7496489882469177, + "step": 8810 + }, + { + "ce_loss": 0.2132815569639206, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.3698428273200989, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.16614781320095062, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 1.0606884956359863, + "step": 8810 + }, + { + "ce_loss": 0.2448640763759613, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.34551236033439636, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.16551008820533752, + "step": 8810 + }, + { + "epoch": 2.941961307538359, + "loss": 0.9164, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "grad_norm": 2.078324317932129, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "learning_rate": 0.0006623003837799761, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 1.024645209312439, + "step": 8820 + }, + { + "ce_loss": 0.27640199661254883, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.4167477488517761, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.19014853239059448, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 0.6186550259590149, + "step": 8820 + }, + { + "ce_loss": 0.16880573332309723, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.2792816758155823, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.1264185756444931, + "step": 8820 + }, + { + "epoch": 2.945296864576384, + "loss": 1.0886, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "grad_norm": 1.649694800376892, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "learning_rate": 0.0006619739816646626, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 1.1843812465667725, + "step": 8830 + }, + { + "ce_loss": 0.2780088484287262, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.46142643690109253, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.19550113379955292, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 1.022020697593689, + "step": 8830 + }, + { + "ce_loss": 0.28591063618659973, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.3770048916339874, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.2763281464576721, + "step": 8830 + }, + { + "epoch": 2.9486324216144095, + "loss": 1.0154, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "grad_norm": 2.126565456390381, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "learning_rate": 0.000661647273797523, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 1.2838503122329712, + "step": 8840 + }, + { + "ce_loss": 0.24286939203739166, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.6261106729507446, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.17261916399002075, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 1.2399978637695312, + "step": 8840 + }, + { + "ce_loss": 0.2863895297050476, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.46719643473625183, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.24916110932826996, + "step": 8840 + }, + { + "epoch": 2.951967978652435, + "loss": 1.1672, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "grad_norm": 2.514310359954834, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "learning_rate": 0.0006613202605598604, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 1.3793586492538452, + "step": 8850 + }, + { + "ce_loss": 0.25462037324905396, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.6040604710578918, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.2562090754508972, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 0.9639979600906372, + "step": 8850 + }, + { + "ce_loss": 0.21080300211906433, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.4669973850250244, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.2147378921508789, + "step": 8850 + }, + { + "epoch": 2.9553035356904602, + "loss": 1.0595, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "grad_norm": 1.8250327110290527, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "learning_rate": 0.0006609929423333345, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.8667522668838501, + "step": 8860 + }, + { + "ce_loss": 0.23248571157455444, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.4130263924598694, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.22097499668598175, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.9345153570175171, + "step": 8860 + }, + { + "ce_loss": 0.28272318840026855, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.4565563201904297, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.19382339715957642, + "step": 8860 + }, + { + "epoch": 2.9586390927284856, + "loss": 1.1138, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "grad_norm": 2.332496404647827, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "learning_rate": 0.0006606653194999608, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 1.3785479068756104, + "step": 8870 + }, + { + "ce_loss": 0.2756645977497101, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.4183840751647949, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.2372225672006607, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 1.4631519317626953, + "step": 8870 + }, + { + "ce_loss": 0.2707836925983429, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.5820114612579346, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.18649905920028687, + "step": 8870 + }, + { + "epoch": 2.961974649766511, + "loss": 1.0423, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "grad_norm": 1.7435004711151123, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "learning_rate": 0.0006603373924421106, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 0.9438967704772949, + "step": 8880 + }, + { + "ce_loss": 0.23838581144809723, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.3440316319465637, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.22295810282230377, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 1.1735258102416992, + "step": 8880 + }, + { + "ce_loss": 0.30681556463241577, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.4880583882331848, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.21688827872276306, + "step": 8880 + }, + { + "epoch": 2.9653102068045363, + "loss": 1.0695, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "grad_norm": 2.8541676998138428, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "learning_rate": 0.00066000916154251, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 1.5222339630126953, + "step": 8890 + }, + { + "ce_loss": 0.3295680284500122, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.46733707189559937, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.20860373973846436, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 1.3878238201141357, + "step": 8890 + }, + { + "ce_loss": 0.277427077293396, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.44741156697273254, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.17672061920166016, + "step": 8890 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.9985, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "grad_norm": 1.472333312034607, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "learning_rate": 0.0006596806271842397, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 1.2768826484680176, + "step": 8900 + }, + { + "ce_loss": 0.37654709815979004, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.4484732151031494, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.2331361323595047, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.9021540284156799, + "step": 8900 + }, + { + "ce_loss": 0.31903746724128723, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.31807395815849304, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.26460903882980347, + "step": 8900 + }, + { + "epoch": 2.971981320880587, + "loss": 1.0338, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "grad_norm": 2.5841994285583496, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "learning_rate": 0.0006593517897507345, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 0.9933534264564514, + "step": 8910 + }, + { + "ce_loss": 0.2849483788013458, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.4897994101047516, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.17452286183834076, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 1.0774074792861938, + "step": 8910 + }, + { + "ce_loss": 0.2757338881492615, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.4135061800479889, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.18409499526023865, + "step": 8910 + }, + { + "epoch": 2.9753168779186123, + "loss": 1.0067, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "grad_norm": 1.905840277671814, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "learning_rate": 0.0006590226496257835, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.9164485931396484, + "step": 8920 + }, + { + "ce_loss": 0.30785995721817017, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.44607120752334595, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.16198371350765228, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 1.0075207948684692, + "step": 8920 + }, + { + "ce_loss": 0.2652585506439209, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.4635201096534729, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.2188246250152588, + "step": 8920 + }, + { + "epoch": 2.9786524349566377, + "loss": 1.1102, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "grad_norm": 3.0330445766448975, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "learning_rate": 0.0006586932071935284, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.9932916164398193, + "step": 8930 + }, + { + "ce_loss": 0.2821085453033447, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.47245728969573975, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.1981649547815323, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 1.2256691455841064, + "step": 8930 + }, + { + "ce_loss": 0.2617366313934326, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.4330045282840729, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.21306806802749634, + "step": 8930 + }, + { + "epoch": 2.981987991994663, + "loss": 1.1046, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "grad_norm": 2.6775052547454834, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "learning_rate": 0.0006583634628384638, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 1.0053170919418335, + "step": 8940 + }, + { + "ce_loss": 0.30795395374298096, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.45006781816482544, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.24674780666828156, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 0.7464403510093689, + "step": 8940 + }, + { + "ce_loss": 0.20934158563613892, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.35018253326416016, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.15509063005447388, + "step": 8940 + }, + { + "epoch": 2.9853235490326884, + "loss": 1.0318, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "grad_norm": 2.196390390396118, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "learning_rate": 0.0006580334169454372, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 1.082023024559021, + "step": 8950 + }, + { + "ce_loss": 0.29764267802238464, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.5086228847503662, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.2039090394973755, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.8325173258781433, + "step": 8950 + }, + { + "ce_loss": 0.25651320815086365, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.4119528830051422, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.16320902109146118, + "step": 8950 + }, + { + "epoch": 2.9886591060707137, + "loss": 1.1138, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "grad_norm": 2.025916337966919, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "learning_rate": 0.0006577030698996472, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 1.0703673362731934, + "step": 8960 + }, + { + "ce_loss": 0.28445735573768616, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.5076282024383545, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.2156871259212494, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 1.503796100616455, + "step": 8960 + }, + { + "ce_loss": 0.2984296977519989, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.572968065738678, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.27950987219810486, + "step": 8960 + }, + { + "epoch": 2.991994663108739, + "loss": 1.0775, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "grad_norm": 2.651618242263794, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "learning_rate": 0.0006573724220866448, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 1.0260261297225952, + "step": 8970 + }, + { + "ce_loss": 0.3148878514766693, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.5199999809265137, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.19105911254882812, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 0.9962505102157593, + "step": 8970 + }, + { + "ce_loss": 0.3074457347393036, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.4438781142234802, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.20244362950325012, + "step": 8970 + }, + { + "epoch": 2.9953302201467644, + "loss": 1.0804, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "grad_norm": 1.9578343629837036, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "learning_rate": 0.0006570414738923314, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.9544330835342407, + "step": 8980 + }, + { + "ce_loss": 0.3170566260814667, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.43722185492515564, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.2000456601381302, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 1.120278239250183, + "step": 8980 + }, + { + "ce_loss": 0.273532897233963, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.42763447761535645, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.2289534956216812, + "step": 8980 + }, + { + "epoch": 2.9986657771847898, + "loss": 1.0876, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "grad_norm": 1.487807035446167, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "learning_rate": 0.0006567102257029592, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 1.4187259674072266, + "step": 8990 + }, + { + "ce_loss": 0.1924740970134735, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.39415788650512695, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.23679277300834656, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.8554670810699463, + "step": 8990 + }, + { + "ce_loss": 0.24595953524112701, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.45313510298728943, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.15625616908073425, + "step": 8990 + }, + { + "epoch": 3.002001334222815, + "loss": 1.112, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "grad_norm": 2.9651665687561035, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "learning_rate": 0.0006563786779051305, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 0.919062077999115, + "step": 9000 + }, + { + "ce_loss": 0.2220434844493866, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.47971224784851074, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.1729557067155838, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 1.1402512788772583, + "step": 9000 + }, + { + "ce_loss": 0.3153040111064911, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.542543888092041, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.22358056902885437, + "step": 9000 + }, + { + "epoch": 3.0053368912608405, + "loss": 1.0866, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "grad_norm": 2.7941534519195557, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "learning_rate": 0.0006560468308857971, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 1.3257273435592651, + "step": 9010 + }, + { + "ce_loss": 0.33371102809906006, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.4266383647918701, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.22311057150363922, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 1.0651869773864746, + "step": 9010 + }, + { + "ce_loss": 0.3537583649158478, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.5079278349876404, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.20298175513744354, + "step": 9010 + }, + { + "epoch": 3.008672448298866, + "loss": 1.0502, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "grad_norm": 1.7431156635284424, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "learning_rate": 0.0006557146850322601, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 1.0537315607070923, + "step": 9020 + }, + { + "ce_loss": 0.29217228293418884, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.4923178255558014, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.22270143032073975, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 0.8989039063453674, + "step": 9020 + }, + { + "ce_loss": 0.2505379617214203, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.41707292199134827, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.18117381632328033, + "step": 9020 + }, + { + "epoch": 3.012008005336891, + "loss": 1.0364, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "grad_norm": 1.988556146621704, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "learning_rate": 0.0006553822407321699, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 0.9533810615539551, + "step": 9030 + }, + { + "ce_loss": 0.2103157341480255, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.41822630167007446, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.19332677125930786, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 1.106896996498108, + "step": 9030 + }, + { + "ce_loss": 0.2565533220767975, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.5410886406898499, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.23619408905506134, + "step": 9030 + }, + { + "epoch": 3.0153435623749165, + "loss": 1.0529, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "grad_norm": 2.4528582096099854, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "learning_rate": 0.0006550494983735243, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 1.010316252708435, + "step": 9040 + }, + { + "ce_loss": 0.3083910048007965, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.49507591128349304, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.20631785690784454, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.9693584442138672, + "step": 9040 + }, + { + "ce_loss": 0.2823527157306671, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.435078889131546, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.20153407752513885, + "step": 9040 + }, + { + "epoch": 3.018679119412942, + "loss": 0.9921, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "grad_norm": 2.484927177429199, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "learning_rate": 0.0006547164583446698, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 1.0611393451690674, + "step": 9050 + }, + { + "ce_loss": 0.3461135923862457, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.5212705135345459, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.19343295693397522, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 1.5962610244750977, + "step": 9050 + }, + { + "ce_loss": 0.3070935606956482, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.5359688997268677, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.2690057158470154, + "step": 9050 + }, + { + "epoch": 3.022014676450967, + "loss": 1.1188, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "grad_norm": 2.0018575191497803, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "learning_rate": 0.0006543831210342998, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 1.0536315441131592, + "step": 9060 + }, + { + "ce_loss": 0.33707576990127563, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.4651211202144623, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.19382964074611664, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 1.1964068412780762, + "step": 9060 + }, + { + "ce_loss": 0.27716806530952454, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.4435596168041229, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.2285054624080658, + "step": 9060 + }, + { + "epoch": 3.0253502334889926, + "loss": 1.0283, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "grad_norm": 1.938596487045288, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "learning_rate": 0.0006540494868314547, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.9109571576118469, + "step": 9070 + }, + { + "ce_loss": 0.16253161430358887, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.41372305154800415, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.20183952152729034, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.8401382565498352, + "step": 9070 + }, + { + "ce_loss": 0.20185644924640656, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.4689839482307434, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.16874492168426514, + "step": 9070 + }, + { + "epoch": 3.028685790527018, + "loss": 1.0059, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "grad_norm": 1.7816444635391235, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "learning_rate": 0.0006537155561255215, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 0.954689085483551, + "step": 9080 + }, + { + "ce_loss": 0.25480887293815613, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.4411703050136566, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.1994798481464386, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 1.000085473060608, + "step": 9080 + }, + { + "ce_loss": 0.2905219495296478, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.43431705236434937, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.2110050618648529, + "step": 9080 + }, + { + "epoch": 3.0320213475650433, + "loss": 1.0526, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "grad_norm": 1.6663061380386353, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "learning_rate": 0.0006533813293062336, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 1.300847053527832, + "step": 9090 + }, + { + "ce_loss": 0.25688230991363525, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.37179258465766907, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.18998339772224426, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 1.0649999380111694, + "step": 9090 + }, + { + "ce_loss": 0.24274015426635742, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.41353946924209595, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.23726420104503632, + "step": 9090 + }, + { + "epoch": 3.0353569046030686, + "loss": 1.0777, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "grad_norm": 2.572343349456787, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "learning_rate": 0.0006530468067636693, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 1.0435293912887573, + "step": 9100 + }, + { + "ce_loss": 0.32857900857925415, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.4863881766796112, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.19983501732349396, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 1.0608022212982178, + "step": 9100 + }, + { + "ce_loss": 0.26578211784362793, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.44077160954475403, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.2041448950767517, + "step": 9100 + }, + { + "epoch": 3.038692461641094, + "loss": 1.0447, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "grad_norm": 2.524587631225586, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "learning_rate": 0.0006527119888882527, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 1.3836387395858765, + "step": 9110 + }, + { + "ce_loss": 0.31120729446411133, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.48757538199424744, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.20786985754966736, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 1.017317295074463, + "step": 9110 + }, + { + "ce_loss": 0.25050947070121765, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.4317574203014374, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.26842379570007324, + "step": 9110 + }, + { + "epoch": 3.0420280186791193, + "loss": 1.0766, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "grad_norm": 2.0388600826263428, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "learning_rate": 0.0006523768760707519, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 1.0211807489395142, + "step": 9120 + }, + { + "ce_loss": 0.2478404939174652, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.37502074241638184, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.23659591376781464, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.7845948338508606, + "step": 9120 + }, + { + "ce_loss": 0.18987131118774414, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.3439366817474365, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.17155510187149048, + "step": 9120 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.9621, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "grad_norm": 2.8099639415740967, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "learning_rate": 0.00065204146870228, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 1.3765853643417358, + "step": 9130 + }, + { + "ce_loss": 0.2633730173110962, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.40176552534103394, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.18845759332180023, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 1.0317814350128174, + "step": 9130 + }, + { + "ce_loss": 0.30526190996170044, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.4671580195426941, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.19728983938694, + "step": 9130 + }, + { + "epoch": 3.04869913275517, + "loss": 1.0236, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "grad_norm": 1.629594326019287, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "learning_rate": 0.0006517057671742934, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 1.2944269180297852, + "step": 9140 + }, + { + "ce_loss": 0.3126814067363739, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.4917336702346802, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.20706507563591003, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 1.0209884643554688, + "step": 9140 + }, + { + "ce_loss": 0.23382870852947235, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.3408361077308655, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.1909978985786438, + "step": 9140 + }, + { + "epoch": 3.0520346897931954, + "loss": 1.0116, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "grad_norm": 2.274972915649414, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "learning_rate": 0.0006513697718785917, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.8121439218521118, + "step": 9150 + }, + { + "ce_loss": 0.20182228088378906, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.36416077613830566, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.19133667647838593, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.9841737747192383, + "step": 9150 + }, + { + "ce_loss": 0.20552538335323334, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.41570305824279785, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.16839125752449036, + "step": 9150 + }, + { + "epoch": 3.0553702468312207, + "loss": 1.0607, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "grad_norm": 2.0157339572906494, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "learning_rate": 0.0006510334832073179, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.8094881176948547, + "step": 9160 + }, + { + "ce_loss": 0.200615793466568, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.3362140357494354, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.1586102694272995, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 1.5575326681137085, + "step": 9160 + }, + { + "ce_loss": 0.27797114849090576, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.4377124011516571, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.1744309812784195, + "step": 9160 + }, + { + "epoch": 3.058705803869246, + "loss": 1.0709, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "grad_norm": 3.011078357696533, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "learning_rate": 0.0006506969015529567, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.6737293601036072, + "step": 9170 + }, + { + "ce_loss": 0.14504103362560272, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.3356139659881592, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.15358547866344452, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.9121361970901489, + "step": 9170 + }, + { + "ce_loss": 0.22314849495887756, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.39828944206237793, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.22030402719974518, + "step": 9170 + }, + { + "epoch": 3.0620413609072714, + "loss": 1.0014, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "grad_norm": 2.376568078994751, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "learning_rate": 0.0006503600273083354, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 1.095672369003296, + "step": 9180 + }, + { + "ce_loss": 0.2906176447868347, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.4271732270717621, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.2026030719280243, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 1.2553006410598755, + "step": 9180 + }, + { + "ce_loss": 0.26068201661109924, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.3658229410648346, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.19992195069789886, + "step": 9180 + }, + { + "epoch": 3.0653769179452968, + "loss": 1.0042, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "grad_norm": 1.8117719888687134, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "learning_rate": 0.0006500228608666222, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 1.051975965499878, + "step": 9190 + }, + { + "ce_loss": 0.31874123215675354, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.4895990788936615, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.1792144477367401, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.8689483404159546, + "step": 9190 + }, + { + "ce_loss": 0.2500791549682617, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.3955501616001129, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.17629677057266235, + "step": 9190 + }, + { + "epoch": 3.068712474983322, + "loss": 0.9835, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "grad_norm": 2.159898519515991, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "learning_rate": 0.0006496854026213269, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.9245696663856506, + "step": 9200 + }, + { + "ce_loss": 0.2620841860771179, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.39770200848579407, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.19167114794254303, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.8449506163597107, + "step": 9200 + }, + { + "ce_loss": 0.20318298041820526, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.3830258846282959, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.19371457397937775, + "step": 9200 + }, + { + "epoch": 3.0720480320213475, + "loss": 1.0434, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "grad_norm": 2.69889760017395, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "learning_rate": 0.0006493476529662996, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.9485535621643066, + "step": 9210 + }, + { + "ce_loss": 0.19093436002731323, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.4069599509239197, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.14216384291648865, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 1.319540023803711, + "step": 9210 + }, + { + "ce_loss": 0.33737826347351074, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.4702030122280121, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.23823755979537964, + "step": 9210 + }, + { + "epoch": 3.075383589059373, + "loss": 1.1373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "grad_norm": 2.137755870819092, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "learning_rate": 0.0006490096122957303, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 1.3876944780349731, + "step": 9220 + }, + { + "ce_loss": 0.3482910692691803, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.5636916160583496, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.2291729897260666, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 1.1224955320358276, + "step": 9220 + }, + { + "ce_loss": 0.27185824513435364, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.47210192680358887, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.1957308053970337, + "step": 9220 + }, + { + "epoch": 3.078719146097398, + "loss": 1.136, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "grad_norm": 1.62136709690094, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "learning_rate": 0.0006486712810041488, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 1.64496910572052, + "step": 9230 + }, + { + "ce_loss": 0.28236740827560425, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.4360547959804535, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.2266317903995514, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 1.0641510486602783, + "step": 9230 + }, + { + "ce_loss": 0.32486236095428467, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.48013004660606384, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.20147085189819336, + "step": 9230 + }, + { + "epoch": 3.0820547031354235, + "loss": 1.1094, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "grad_norm": 1.375726580619812, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "learning_rate": 0.0006483326594864243, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 1.3441393375396729, + "step": 9240 + }, + { + "ce_loss": 0.32190045714378357, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.40649378299713135, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.2810215353965759, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 1.1238963603973389, + "step": 9240 + }, + { + "ce_loss": 0.2841211259365082, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.38098785281181335, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.2101936638355255, + "step": 9240 + }, + { + "epoch": 3.085390260173449, + "loss": 1.055, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "grad_norm": 1.4783991575241089, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "learning_rate": 0.0006479937481377644, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 0.8815041780471802, + "step": 9250 + }, + { + "ce_loss": 0.2548271119594574, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.43178367614746094, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.1447189897298813, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 1.0925407409667969, + "step": 9250 + }, + { + "ce_loss": 0.22870661318302155, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.4754504859447479, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.21979738771915436, + "step": 9250 + }, + { + "epoch": 3.088725817211474, + "loss": 1.0436, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "grad_norm": 2.365377187728882, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "learning_rate": 0.0006476545473537153, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 0.7632279396057129, + "step": 9260 + }, + { + "ce_loss": 0.19780153036117554, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.3653009235858917, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.19955159723758698, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 1.241063117980957, + "step": 9260 + }, + { + "ce_loss": 0.2871393859386444, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.47732460498809814, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.21136754751205444, + "step": 9260 + }, + { + "epoch": 3.0920613742494996, + "loss": 1.052, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "grad_norm": 1.8858585357666016, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "learning_rate": 0.0006473150575301607, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.9567256569862366, + "step": 9270 + }, + { + "ce_loss": 0.2105388045310974, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.3784027099609375, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.189395472407341, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 1.0037850141525269, + "step": 9270 + }, + { + "ce_loss": 0.2868437170982361, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.3869924545288086, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.18987053632736206, + "step": 9270 + }, + { + "epoch": 3.095396931287525, + "loss": 0.9799, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "grad_norm": 2.3767271041870117, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "learning_rate": 0.0006469752790633218, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 1.0870213508605957, + "step": 9280 + }, + { + "ce_loss": 0.29317188262939453, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.43251579999923706, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.17761041224002838, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 0.8081312775611877, + "step": 9280 + }, + { + "ce_loss": 0.21595090627670288, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.3709900677204132, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.17402665317058563, + "step": 9280 + }, + { + "epoch": 3.0987324883255503, + "loss": 1.0143, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "grad_norm": 2.8004062175750732, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "learning_rate": 0.0006466352123497565, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 1.0892130136489868, + "step": 9290 + }, + { + "ce_loss": 0.24042299389839172, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.4947637915611267, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.21145179867744446, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.995558500289917, + "step": 9290 + }, + { + "ce_loss": 0.2777062654495239, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.4128227233886719, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.22341780364513397, + "step": 9290 + }, + { + "epoch": 3.1020680453635756, + "loss": 1.0309, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "grad_norm": 1.9543800354003906, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "learning_rate": 0.0006462948577863593, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 1.1493991613388062, + "step": 9300 + }, + { + "ce_loss": 0.2970648407936096, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.4186588227748871, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.21933016180992126, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 1.5194854736328125, + "step": 9300 + }, + { + "ce_loss": 0.2752871513366699, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.480773001909256, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.1940537542104721, + "step": 9300 + }, + { + "epoch": 3.105403602401601, + "loss": 0.9941, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "grad_norm": 2.3288204669952393, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "learning_rate": 0.0006459542157703608, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.8614883422851562, + "step": 9310 + }, + { + "ce_loss": 0.20192290842533112, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.4057275652885437, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.18818989396095276, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.9263197779655457, + "step": 9310 + }, + { + "ce_loss": 0.2823805510997772, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.3929421603679657, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.19806356728076935, + "step": 9310 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.9622, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "grad_norm": 2.14695405960083, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "learning_rate": 0.0006456132866993266, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 1.4668676853179932, + "step": 9320 + }, + { + "ce_loss": 0.30181676149368286, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.40018582344055176, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.25091373920440674, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.9198508262634277, + "step": 9320 + }, + { + "ce_loss": 0.2544313073158264, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.3781255781650543, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.22032621502876282, + "step": 9320 + }, + { + "epoch": 3.1120747164776517, + "loss": 1.082, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "grad_norm": 1.727124810218811, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "learning_rate": 0.0006452720709711578, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 1.3362449407577515, + "step": 9330 + }, + { + "ce_loss": 0.2979918420314789, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.36880090832710266, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.24092470109462738, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.8599693775177002, + "step": 9330 + }, + { + "ce_loss": 0.24267035722732544, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.40098074078559875, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.14831340312957764, + "step": 9330 + }, + { + "epoch": 3.115410273515677, + "loss": 0.9737, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "grad_norm": 2.032933473587036, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "learning_rate": 0.0006449305689840898, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.960468590259552, + "step": 9340 + }, + { + "ce_loss": 0.24077749252319336, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.4729737341403961, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.19689474999904633, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.8800736665725708, + "step": 9340 + }, + { + "ce_loss": 0.23090268671512604, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.4070270359516144, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.18194246292114258, + "step": 9340 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.9708, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "grad_norm": 2.4061787128448486, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "learning_rate": 0.0006445887811366922, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.956256628036499, + "step": 9350 + }, + { + "ce_loss": 0.2822020649909973, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.4642751216888428, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.209703266620636, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 1.186940312385559, + "step": 9350 + }, + { + "ce_loss": 0.3359937369823456, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.46250587701797485, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.26884305477142334, + "step": 9350 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.9747, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "grad_norm": 1.8572888374328613, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "learning_rate": 0.000644246707827868, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 1.1304075717926025, + "step": 9360 + }, + { + "ce_loss": 0.32038217782974243, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.49822553992271423, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.23027847707271576, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 1.209761142730713, + "step": 9360 + }, + { + "ce_loss": 0.31924429535865784, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.5770853757858276, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.2517629861831665, + "step": 9360 + }, + { + "epoch": 3.125416944629753, + "loss": 1.0919, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "grad_norm": 2.1829588413238525, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "learning_rate": 0.0006439043494568539, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.984476625919342, + "step": 9370 + }, + { + "ce_loss": 0.2882232666015625, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.4718204140663147, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.22438225150108337, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.8405330181121826, + "step": 9370 + }, + { + "ce_loss": 0.2212357521057129, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.4073556959629059, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.16576293110847473, + "step": 9370 + }, + { + "epoch": 3.1287525016677784, + "loss": 1.049, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "grad_norm": 2.646833896636963, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "learning_rate": 0.0006435617064232187, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 1.037705898284912, + "step": 9380 + }, + { + "ce_loss": 0.242634579539299, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.448700487613678, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.2192038893699646, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 1.039435863494873, + "step": 9380 + }, + { + "ce_loss": 0.32878270745277405, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.4557130038738251, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.22904300689697266, + "step": 9380 + }, + { + "epoch": 3.1320880587058038, + "loss": 1.0306, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "grad_norm": 2.9341678619384766, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "learning_rate": 0.0006432187791268639, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 1.1417816877365112, + "step": 9390 + }, + { + "ce_loss": 0.22817549109458923, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.38718181848526, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.1807437539100647, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.8131179213523865, + "step": 9390 + }, + { + "ce_loss": 0.24570348858833313, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.37476658821105957, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.19250284135341644, + "step": 9390 + }, + { + "epoch": 3.135423615743829, + "loss": 1.0896, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "grad_norm": 3.520961046218872, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "learning_rate": 0.0006428755679680224, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.8953517079353333, + "step": 9400 + }, + { + "ce_loss": 0.20580261945724487, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.3411506116390228, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.17772138118743896, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.9172338247299194, + "step": 9400 + }, + { + "ce_loss": 0.263776957988739, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.37165242433547974, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.1233987957239151, + "step": 9400 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.9324, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "grad_norm": 1.602327585220337, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "learning_rate": 0.0006425320733472585, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.6967126727104187, + "step": 9410 + }, + { + "ce_loss": 0.21138957142829895, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.29559966921806335, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.14123915135860443, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 1.1047972440719604, + "step": 9410 + }, + { + "ce_loss": 0.26694992184638977, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.35893869400024414, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.22025825083255768, + "step": 9410 + }, + { + "epoch": 3.14209472981988, + "loss": 0.9499, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "grad_norm": 1.3611681461334229, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "learning_rate": 0.0006421882956654676, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 0.9599966406822205, + "step": 9420 + }, + { + "ce_loss": 0.2786065936088562, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.3393070697784424, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.20848676562309265, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 1.2619997262954712, + "step": 9420 + }, + { + "ce_loss": 0.29320430755615234, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.4119172692298889, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.23351138830184937, + "step": 9420 + }, + { + "epoch": 3.145430286857905, + "loss": 1.0658, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "grad_norm": 1.6318954229354858, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "learning_rate": 0.000641844235323875, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.9952999353408813, + "step": 9430 + }, + { + "ce_loss": 0.2627035081386566, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.43587803840637207, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.22304628789424896, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.8246051073074341, + "step": 9430 + }, + { + "ce_loss": 0.20812024176120758, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.3872748911380768, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.18635542690753937, + "step": 9430 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.9934, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "grad_norm": 1.9390169382095337, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "learning_rate": 0.0006414998927240363, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.9947294592857361, + "step": 9440 + }, + { + "ce_loss": 0.3259407579898834, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.44335800409317017, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.18987657129764557, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.6140831708908081, + "step": 9440 + }, + { + "ce_loss": 0.16870209574699402, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.2950131297111511, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.15002664923667908, + "step": 9440 + }, + { + "epoch": 3.152101400933956, + "loss": 0.9927, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "grad_norm": 3.063666582107544, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "learning_rate": 0.0006411552682678365, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 1.0572447776794434, + "step": 9450 + }, + { + "ce_loss": 0.3054739236831665, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.43969228863716125, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.24192282557487488, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 0.8156782388687134, + "step": 9450 + }, + { + "ce_loss": 0.23238930106163025, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.33370551466941833, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.19612304866313934, + "step": 9450 + }, + { + "epoch": 3.155436957971981, + "loss": 0.965, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "grad_norm": 3.6377336978912354, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "learning_rate": 0.0006408103623574891, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 0.7613427639007568, + "step": 9460 + }, + { + "ce_loss": 0.2139458805322647, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.2921532392501831, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.20600202679634094, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 1.1687495708465576, + "step": 9460 + }, + { + "ce_loss": 0.27339303493499756, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.39343589544296265, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.25108566880226135, + "step": 9460 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.97, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "grad_norm": 2.286971092224121, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "learning_rate": 0.0006404651753955363, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.9634687304496765, + "step": 9470 + }, + { + "ce_loss": 0.30855584144592285, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.3636634647846222, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.2440856397151947, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 1.6036324501037598, + "step": 9470 + }, + { + "ce_loss": 0.3264394998550415, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.4360540509223938, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.2085307240486145, + "step": 9470 + }, + { + "epoch": 3.162108072048032, + "loss": 1.0855, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "grad_norm": 2.486076593399048, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "learning_rate": 0.000640119707784849, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.7592119574546814, + "step": 9480 + }, + { + "ce_loss": 0.19020523130893707, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.3947354555130005, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.17405526340007782, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.9845924973487854, + "step": 9480 + }, + { + "ce_loss": 0.24522338807582855, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.38083866238594055, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.21165917813777924, + "step": 9480 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.9108, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "grad_norm": 1.6602882146835327, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "learning_rate": 0.0006397739599286248, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.8489893674850464, + "step": 9490 + }, + { + "ce_loss": 0.22476591169834137, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.4076571464538574, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.1667369157075882, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 1.1844907999038696, + "step": 9490 + }, + { + "ce_loss": 0.24899810552597046, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.35540464520454407, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.20953817665576935, + "step": 9490 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.9954, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "grad_norm": 2.153787612915039, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "learning_rate": 0.0006394279322303885, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 1.1891601085662842, + "step": 9500 + }, + { + "ce_loss": 0.40403467416763306, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.5030407905578613, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.28187599778175354, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.8806698322296143, + "step": 9500 + }, + { + "ce_loss": 0.21907956898212433, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.4144791066646576, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.15799804031848907, + "step": 9500 + }, + { + "epoch": 3.172114743162108, + "loss": 1.0545, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "grad_norm": 1.8217953443527222, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "learning_rate": 0.0006390816250939918, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.9632741212844849, + "step": 9510 + }, + { + "ce_loss": 0.27603211998939514, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.45113474130630493, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.1889391541481018, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.881806492805481, + "step": 9510 + }, + { + "ce_loss": 0.20647741854190826, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.4694068133831024, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.18652670085430145, + "step": 9510 + }, + { + "epoch": 3.1754503002001333, + "loss": 1.0121, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "grad_norm": 1.8334593772888184, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "learning_rate": 0.0006387350389236124, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.9273632168769836, + "step": 9520 + }, + { + "ce_loss": 0.2923687696456909, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.4269871115684509, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.20764534175395966, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.6492568850517273, + "step": 9520 + }, + { + "ce_loss": 0.1861332356929779, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.3245824873447418, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.1382860392332077, + "step": 9520 + }, + { + "epoch": 3.1787858572381587, + "loss": 1.068, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "grad_norm": 1.825934886932373, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "learning_rate": 0.0006383881741237535, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.9091103672981262, + "step": 9530 + }, + { + "ce_loss": 0.2934170663356781, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.3609623312950134, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.19794169068336487, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.8751694560050964, + "step": 9530 + }, + { + "ce_loss": 0.2098688930273056, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.37939339876174927, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.198336660861969, + "step": 9530 + }, + { + "epoch": 3.182121414276184, + "loss": 1.1017, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "grad_norm": 3.3589539527893066, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "learning_rate": 0.0006380410310992438, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 1.1012458801269531, + "step": 9540 + }, + { + "ce_loss": 0.29811060428619385, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.4516919255256653, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.26902395486831665, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 0.8544487953186035, + "step": 9540 + }, + { + "ce_loss": 0.19948647916316986, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.3371371328830719, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.1422787457704544, + "step": 9540 + }, + { + "epoch": 3.1854569713142094, + "loss": 1.0451, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "grad_norm": 1.9161041975021362, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "learning_rate": 0.0006376936102552368, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.7991788387298584, + "step": 9550 + }, + { + "ce_loss": 0.22001229226589203, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.32813096046447754, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.16685843467712402, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.8779180645942688, + "step": 9550 + }, + { + "ce_loss": 0.24932724237442017, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.37945684790611267, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.20249043405056, + "step": 9550 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.9951, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "grad_norm": 1.8126941919326782, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "learning_rate": 0.0006373459119972095, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.7515552043914795, + "step": 9560 + }, + { + "ce_loss": 0.17431022226810455, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.32552477717399597, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.15542994439601898, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.9562395215034485, + "step": 9560 + }, + { + "ce_loss": 0.30415454506874084, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.40889936685562134, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.24302524328231812, + "step": 9560 + }, + { + "epoch": 3.19212808539026, + "loss": 0.961, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "grad_norm": 2.174283266067505, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "learning_rate": 0.0006369979367309635, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 0.9469784498214722, + "step": 9570 + }, + { + "ce_loss": 0.2929200828075409, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.4416431486606598, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.21219401061534882, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 1.0700886249542236, + "step": 9570 + }, + { + "ce_loss": 0.25346022844314575, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.36397501826286316, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.22573909163475037, + "step": 9570 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.9896, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "grad_norm": 2.4014036655426025, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "learning_rate": 0.0006366496848626232, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 1.5717496871948242, + "step": 9580 + }, + { + "ce_loss": 0.257915735244751, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.33823972940444946, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.16347834467887878, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 1.0040936470031738, + "step": 9580 + }, + { + "ce_loss": 0.2746289074420929, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.37868228554725647, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.1852872520685196, + "step": 9580 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.936, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "grad_norm": 1.3955729007720947, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "learning_rate": 0.0006363011567986361, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.7811391949653625, + "step": 9590 + }, + { + "ce_loss": 0.22975607216358185, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.3405916094779968, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.16584502160549164, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 1.0799206495285034, + "step": 9590 + }, + { + "ce_loss": 0.2468085139989853, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.32198700308799744, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.17793063819408417, + "step": 9590 + }, + { + "epoch": 3.202134756504336, + "loss": 1.0321, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "grad_norm": 2.6115453243255615, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "learning_rate": 0.000635952352945772, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.9059250354766846, + "step": 9600 + }, + { + "ce_loss": 0.23575346171855927, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.4092578887939453, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.21158456802368164, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.9089540839195251, + "step": 9600 + }, + { + "ce_loss": 0.2593563497066498, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.4364529550075531, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.1588144451379776, + "step": 9600 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.9798, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "grad_norm": 2.0719449520111084, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "learning_rate": 0.0006356032737111226, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.967341423034668, + "step": 9610 + }, + { + "ce_loss": 0.25755563378334045, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.38610100746154785, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.18525567650794983, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.9186158180236816, + "step": 9610 + }, + { + "ce_loss": 0.19721673429012299, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.4046347737312317, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.18537798523902893, + "step": 9610 + }, + { + "epoch": 3.208805870580387, + "loss": 1.0076, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "grad_norm": 4.249617576599121, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "learning_rate": 0.0006352539195021007, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 0.931041955947876, + "step": 9620 + }, + { + "ce_loss": 0.23878906667232513, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.4049624502658844, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.2291223406791687, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 1.1312953233718872, + "step": 9620 + }, + { + "ce_loss": 0.16920793056488037, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.4426681399345398, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.17689041793346405, + "step": 9620 + }, + { + "epoch": 3.212141427618412, + "loss": 0.959, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "grad_norm": 2.485501527786255, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "learning_rate": 0.0006349042907264404, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.7932411432266235, + "step": 9630 + }, + { + "ce_loss": 0.2088746428489685, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.38120004534721375, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.15612326562404633, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.9963729381561279, + "step": 9630 + }, + { + "ce_loss": 0.2870686948299408, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.4551391005516052, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.21353857219219208, + "step": 9630 + }, + { + "epoch": 3.2154769846564375, + "loss": 1.0746, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "grad_norm": 2.1060545444488525, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "learning_rate": 0.0006345543877921961, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.9149405360221863, + "step": 9640 + }, + { + "ce_loss": 0.26410815119743347, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.44474872946739197, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.16752924025058746, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 1.069939374923706, + "step": 9640 + }, + { + "ce_loss": 0.2878960967063904, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.4400632381439209, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.18491016328334808, + "step": 9640 + }, + { + "epoch": 3.218812541694463, + "loss": 1.0146, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "grad_norm": 1.780337929725647, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "learning_rate": 0.000634204211107742, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 0.9347214698791504, + "step": 9650 + }, + { + "ce_loss": 0.29288166761398315, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.3937554955482483, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.1980094611644745, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 0.8976361751556396, + "step": 9650 + }, + { + "ce_loss": 0.25129374861717224, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.4056212306022644, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.1826409250497818, + "step": 9650 + }, + { + "epoch": 3.222148098732488, + "loss": 1.02, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "grad_norm": 1.546045184135437, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "learning_rate": 0.0006338537610817722, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 0.8476258516311646, + "step": 9660 + }, + { + "ce_loss": 0.19795475900173187, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.43406128883361816, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.16292031109333038, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 1.0019570589065552, + "step": 9660 + }, + { + "ce_loss": 0.3020380735397339, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.44856947660446167, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.20614181458950043, + "step": 9660 + }, + { + "epoch": 3.2254836557705135, + "loss": 1.0188, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "grad_norm": 2.204603672027588, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "learning_rate": 0.0006335030381232998, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 1.1082043647766113, + "step": 9670 + }, + { + "ce_loss": 0.2776288688182831, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 0.5517551898956299, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.22000828385353088, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 0.9066957235336304, + "step": 9670 + }, + { + "ce_loss": 0.19364385306835175, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 0.46161043643951416, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.1524243801832199, + "step": 9670 + }, + { + "epoch": 3.228819212808539, + "loss": 1.1544, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "grad_norm": 2.8075547218322754, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "learning_rate": 0.0006331520426416556, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 0.9625744223594666, + "step": 9680 + }, + { + "ce_loss": 0.22598817944526672, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.4440693259239197, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.1613425761461258, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 1.153572678565979, + "step": 9680 + }, + { + "ce_loss": 0.2614941895008087, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.434669554233551, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.22456280887126923, + "step": 9680 + }, + { + "epoch": 3.2321547698465642, + "loss": 1.0116, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "grad_norm": 1.9188731908798218, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "learning_rate": 0.0006328007750464895, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.9416834115982056, + "step": 9690 + }, + { + "ce_loss": 0.2584744095802307, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.4141307473182678, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.19777798652648926, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.8339923620223999, + "step": 9690 + }, + { + "ce_loss": 0.239766925573349, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.4033082127571106, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.1904684603214264, + "step": 9690 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.9304, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "grad_norm": 2.2980542182922363, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "learning_rate": 0.0006324492357477685, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.6538512706756592, + "step": 9700 + }, + { + "ce_loss": 0.15079143643379211, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.3154073655605316, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.13001316785812378, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 1.2124816179275513, + "step": 9700 + }, + { + "ce_loss": 0.27233535051345825, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.4675295948982239, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.22750096023082733, + "step": 9700 + }, + { + "epoch": 3.238825883922615, + "loss": 1.032, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "grad_norm": 2.910517454147339, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "learning_rate": 0.0006320974251557769, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 1.0354328155517578, + "step": 9710 + }, + { + "ce_loss": 0.27194535732269287, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.3568897247314453, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.21175265312194824, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 1.016433835029602, + "step": 9710 + }, + { + "ce_loss": 0.244206964969635, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.383299320936203, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.22494517266750336, + "step": 9710 + }, + { + "epoch": 3.2421614409606403, + "loss": 1.0236, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "grad_norm": 2.3274762630462646, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "learning_rate": 0.0006317453436811154, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 1.0498437881469727, + "step": 9720 + }, + { + "ce_loss": 0.3165692687034607, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.44313669204711914, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.22029148042201996, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.939366340637207, + "step": 9720 + }, + { + "ce_loss": 0.23943881690502167, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.34780487418174744, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.197583869099617, + "step": 9720 + }, + { + "epoch": 3.2454969979986656, + "loss": 1.0895, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "grad_norm": 3.5488083362579346, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "learning_rate": 0.0006313929917347011, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.9897884130477905, + "step": 9730 + }, + { + "ce_loss": 0.2667106091976166, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.5222178101539612, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.20061685144901276, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.8107883930206299, + "step": 9730 + }, + { + "ce_loss": 0.20364002883434296, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.3627731204032898, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.1910681426525116, + "step": 9730 + }, + { + "epoch": 3.248832555036691, + "loss": 1.003, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "grad_norm": 2.29548978805542, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "learning_rate": 0.0006310403697277663, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 1.0931097269058228, + "step": 9740 + }, + { + "ce_loss": 0.29702019691467285, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.45877814292907715, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.2571999728679657, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 1.3060147762298584, + "step": 9740 + }, + { + "ce_loss": 0.2924763262271881, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.46052947640419006, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.2098548263311386, + "step": 9740 + }, + { + "epoch": 3.2521681120747163, + "loss": 1.0576, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "grad_norm": 1.9056835174560547, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "learning_rate": 0.0006306874780718593, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.9245695471763611, + "step": 9750 + }, + { + "ce_loss": 0.3011467158794403, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.3892473578453064, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.1900477558374405, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 1.0070749521255493, + "step": 9750 + }, + { + "ce_loss": 0.2565157115459442, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.44015613198280334, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.2389518767595291, + "step": 9750 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.949, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "grad_norm": 1.44841468334198, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "learning_rate": 0.0006303343171788422, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.9746924042701721, + "step": 9760 + }, + { + "ce_loss": 0.3471539318561554, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.3837020993232727, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.19318555295467377, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.8271773457527161, + "step": 9760 + }, + { + "ce_loss": 0.24171385169029236, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.34053492546081543, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.18620966374874115, + "step": 9760 + }, + { + "epoch": 3.258839226150767, + "loss": 0.9802, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "grad_norm": 1.7812227010726929, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "learning_rate": 0.0006299808874608919, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.8848695755004883, + "step": 9770 + }, + { + "ce_loss": 0.2525423467159271, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.39496883749961853, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.1729382425546646, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.7401137948036194, + "step": 9770 + }, + { + "ce_loss": 0.21388928592205048, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.30214762687683105, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.16249021887779236, + "step": 9770 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.9537, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "grad_norm": 1.8796312808990479, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "learning_rate": 0.0006296271893304992, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 1.426863193511963, + "step": 9780 + }, + { + "ce_loss": 0.306301474571228, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.3925962448120117, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.1583440750837326, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.8414177298545837, + "step": 9780 + }, + { + "ce_loss": 0.2551378905773163, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.33810290694236755, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.19451551139354706, + "step": 9780 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.9865, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "grad_norm": 1.5099884271621704, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "learning_rate": 0.0006292732232004675, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.8750608563423157, + "step": 9790 + }, + { + "ce_loss": 0.24614830315113068, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.40064993500709534, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.18194952607154846, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 1.3389601707458496, + "step": 9790 + }, + { + "ce_loss": 0.33404234051704407, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.5011653304100037, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.19928906857967377, + "step": 9790 + }, + { + "epoch": 3.268845897264843, + "loss": 0.9772, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "grad_norm": 1.6015758514404297, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "learning_rate": 0.0006289189894839135, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.7471339106559753, + "step": 9800 + }, + { + "ce_loss": 0.19947054982185364, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.33740973472595215, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.2094244807958603, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.9930317997932434, + "step": 9800 + }, + { + "ce_loss": 0.2759966552257538, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.4113280475139618, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.241688534617424, + "step": 9800 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.888, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "grad_norm": 1.7993124723434448, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "learning_rate": 0.0006285644885942661, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.8379071950912476, + "step": 9810 + }, + { + "ce_loss": 0.24445652961730957, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.32546156644821167, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.1814669668674469, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 1.4976379871368408, + "step": 9810 + }, + { + "ce_loss": 0.26668861508369446, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.37651145458221436, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.20988450944423676, + "step": 9810 + }, + { + "epoch": 3.275517011340894, + "loss": 0.982, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "grad_norm": 2.3544046878814697, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "learning_rate": 0.0006282097209452661, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 0.9182118773460388, + "step": 9820 + }, + { + "ce_loss": 0.23315070569515228, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.4567936062812805, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.18071922659873962, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 1.4837809801101685, + "step": 9820 + }, + { + "ce_loss": 0.2943566143512726, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.46521204710006714, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.24306204915046692, + "step": 9820 + }, + { + "epoch": 3.278852568378919, + "loss": 0.9991, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "grad_norm": 1.9920049905776978, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "learning_rate": 0.0006278546869509651, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 1.4693489074707031, + "step": 9830 + }, + { + "ce_loss": 0.24498344957828522, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.456991046667099, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.1735164374113083, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 0.9276065230369568, + "step": 9830 + }, + { + "ce_loss": 0.2843460738658905, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.43910086154937744, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.20382851362228394, + "step": 9830 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.9999, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "grad_norm": 1.3909965753555298, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "learning_rate": 0.0006274993870257265, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.9617606401443481, + "step": 9840 + }, + { + "ce_loss": 0.23345062136650085, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.37801143527030945, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.20465831458568573, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.869107186794281, + "step": 9840 + }, + { + "ce_loss": 0.2557021379470825, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.42464479804039, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.15845350921154022, + "step": 9840 + }, + { + "epoch": 3.28552368245497, + "loss": 1.0277, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "grad_norm": 2.674233913421631, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "learning_rate": 0.000627143821584223, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 1.193628191947937, + "step": 9850 + }, + { + "ce_loss": 0.33579155802726746, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.48818734288215637, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.24845334887504578, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 0.9142537713050842, + "step": 9850 + }, + { + "ce_loss": 0.23797516524791718, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.40407562255859375, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.18777459859848022, + "step": 9850 + }, + { + "epoch": 3.288859239492995, + "loss": 1.088, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "grad_norm": 1.5202012062072754, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "learning_rate": 0.0006267879910414383, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 1.2795214653015137, + "step": 9860 + }, + { + "ce_loss": 0.3531726896762848, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.5422238111495972, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.21500453352928162, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 0.6902878880500793, + "step": 9860 + }, + { + "ce_loss": 0.16109101474285126, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.34586697816848755, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.18270200490951538, + "step": 9860 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.9348, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "grad_norm": 1.827506184577942, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "learning_rate": 0.0006264318958126645, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.8399723172187805, + "step": 9870 + }, + { + "ce_loss": 0.2216285616159439, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.3668162524700165, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.1240381971001625, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 1.3451710939407349, + "step": 9870 + }, + { + "ce_loss": 0.4018022119998932, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.47588586807250977, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.2462722659111023, + "step": 9870 + }, + { + "epoch": 3.295530353569046, + "loss": 1.0435, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "grad_norm": 1.4911365509033203, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "learning_rate": 0.0006260755363135033, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.9486567974090576, + "step": 9880 + }, + { + "ce_loss": 0.22665971517562866, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.45116129517555237, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.1920081079006195, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.8320479989051819, + "step": 9880 + }, + { + "ce_loss": 0.1915288269519806, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.33396807312965393, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.18080615997314453, + "step": 9880 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.9529, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "grad_norm": 1.6564595699310303, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "learning_rate": 0.0006257189129598645, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.7733842730522156, + "step": 9890 + }, + { + "ce_loss": 0.22635827958583832, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.3659568428993225, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.1503317505121231, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.8225291967391968, + "step": 9890 + }, + { + "ce_loss": 0.24125638604164124, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.3937651515007019, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.18737755715847015, + "step": 9890 + }, + { + "epoch": 3.3022014676450966, + "loss": 1.0092, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "grad_norm": 2.3174591064453125, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "learning_rate": 0.0006253620261679659, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 1.4922536611557007, + "step": 9900 + }, + { + "ce_loss": 0.2831789255142212, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.4154704511165619, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.23491549491882324, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 0.7731677889823914, + "step": 9900 + }, + { + "ce_loss": 0.18771180510520935, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.3617543578147888, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.1634535938501358, + "step": 9900 + }, + { + "epoch": 3.305537024683122, + "loss": 1.0277, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "grad_norm": 1.5574588775634766, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "learning_rate": 0.0006250048763543326, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.6726523637771606, + "step": 9910 + }, + { + "ce_loss": 0.15559059381484985, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.34071028232574463, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.17615939676761627, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.7978571653366089, + "step": 9910 + }, + { + "ce_loss": 0.24486663937568665, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.3446405529975891, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.2081782966852188, + "step": 9910 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.9791, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "grad_norm": 3.7845053672790527, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "learning_rate": 0.0006246474639357973, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 1.7719227075576782, + "step": 9920 + }, + { + "ce_loss": 0.33106061816215515, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.4632108211517334, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.24532441794872284, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.9917610883712769, + "step": 9920 + }, + { + "ce_loss": 0.1957157701253891, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.31435099244117737, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.17546167969703674, + "step": 9920 + }, + { + "epoch": 3.3122081387591726, + "loss": 1.0102, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "grad_norm": 2.843688726425171, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "learning_rate": 0.0006242897893294984, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 1.0776625871658325, + "step": 9930 + }, + { + "ce_loss": 0.25643882155418396, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.3841439187526703, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.23373252153396606, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 1.0383449792861938, + "step": 9930 + }, + { + "ce_loss": 0.32289958000183105, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.43312522768974304, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.21836471557617188, + "step": 9930 + }, + { + "epoch": 3.315543695797198, + "loss": 1.0612, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "grad_norm": 2.5459680557250977, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "learning_rate": 0.000623931852952881, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.7622495889663696, + "step": 9940 + }, + { + "ce_loss": 0.21401986479759216, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.3269309997558594, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.18412983417510986, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.906096339225769, + "step": 9940 + }, + { + "ce_loss": 0.2815874516963959, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.36980006098747253, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.23133040964603424, + "step": 9940 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.991, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "grad_norm": 1.8961730003356934, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "learning_rate": 0.000623573655223695, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.9649955630302429, + "step": 9950 + }, + { + "ce_loss": 0.2432449609041214, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.46102091670036316, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.21075601875782013, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 1.0744799375534058, + "step": 9950 + }, + { + "ce_loss": 0.2550390362739563, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.39716631174087524, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.22736836969852448, + "step": 9950 + }, + { + "epoch": 3.3222148098732487, + "loss": 1.0089, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "grad_norm": 1.9671109914779663, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "learning_rate": 0.0006232151965599956, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 1.4595311880111694, + "step": 9960 + }, + { + "ce_loss": 0.2657226026058197, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.4428955614566803, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.20473840832710266, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 0.846092700958252, + "step": 9960 + }, + { + "ce_loss": 0.19718728959560394, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.3793059289455414, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.2034681737422943, + "step": 9960 + }, + { + "epoch": 3.325550366911274, + "loss": 1.038, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "grad_norm": 1.7723923921585083, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "learning_rate": 0.0006228564773801431, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.8382503986358643, + "step": 9970 + }, + { + "ce_loss": 0.25603431463241577, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.3779396712779999, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.20399893820285797, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.7795833349227905, + "step": 9970 + }, + { + "ce_loss": 0.20238620042800903, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.363075852394104, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.17283816635608673, + "step": 9970 + }, + { + "epoch": 3.3288859239492994, + "loss": 1.0055, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "grad_norm": 1.9274988174438477, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "learning_rate": 0.0006224974981028012, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.7052872776985168, + "step": 9980 + }, + { + "ce_loss": 0.19602438807487488, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.3226252794265747, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.15171216428279877, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.9317882657051086, + "step": 9980 + }, + { + "ce_loss": 0.29486575722694397, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.4522726237773895, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.18425706028938293, + "step": 9980 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.9846, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "grad_norm": 2.507467746734619, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "learning_rate": 0.0006221382591469371, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.9769386053085327, + "step": 9990 + }, + { + "ce_loss": 0.30808940529823303, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.40864020586013794, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.21908801794052124, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 1.0252798795700073, + "step": 9990 + }, + { + "ce_loss": 0.17562223970890045, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.3085121214389801, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.19351007044315338, + "step": 9990 + }, + { + "epoch": 3.33555703802535, + "loss": 0.9897, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "grad_norm": 2.2838590145111084, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "learning_rate": 0.0006217787609318217, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.8830081820487976, + "step": 10000 + }, + { + "ce_loss": 0.21135838329792023, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.42949768900871277, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.18174947798252106, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.8349977731704712, + "step": 10000 + }, + { + "ce_loss": 0.21382983028888702, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.3579995632171631, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.20931722223758698, + "step": 10000 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.9685, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "grad_norm": 1.8487814664840698, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "learning_rate": 0.0006214190038770278, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 1.5042794942855835, + "step": 10010 + }, + { + "ce_loss": 0.2764376997947693, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.43299469351768494, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.21280337870121002, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.9258934259414673, + "step": 10010 + }, + { + "ce_loss": 0.25367021560668945, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.34598594903945923, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.20186175405979156, + "step": 10010 + }, + { + "epoch": 3.342228152101401, + "loss": 1.023, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "grad_norm": 2.127298355102539, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "learning_rate": 0.0006210589884024307, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 1.0435254573822021, + "step": 10020 + }, + { + "ce_loss": 0.2094258964061737, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.3154853582382202, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.19164526462554932, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 0.9915556907653809, + "step": 10020 + }, + { + "ce_loss": 0.2046308070421219, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.41096386313438416, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.14242751896381378, + "step": 10020 + }, + { + "epoch": 3.345563709139426, + "loss": 1.0016, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "grad_norm": 2.440781831741333, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "learning_rate": 0.0006206987149282073, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 1.0040383338928223, + "step": 10030 + }, + { + "ce_loss": 0.2689066231250763, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.48362284898757935, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.20056311786174774, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 0.9044297337532043, + "step": 10030 + }, + { + "ce_loss": 0.21311141550540924, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.48758459091186523, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.2009950578212738, + "step": 10030 + }, + { + "epoch": 3.3488992661774515, + "loss": 1.0939, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "grad_norm": 2.704503297805786, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "learning_rate": 0.0006203381838748353, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.9915063977241516, + "step": 10040 + }, + { + "ce_loss": 0.24685484170913696, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.4709896743297577, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.2363121062517166, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 1.0160868167877197, + "step": 10040 + }, + { + "ce_loss": 0.2190462350845337, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.49053841829299927, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.16573746502399445, + "step": 10040 + }, + { + "epoch": 3.352234823215477, + "loss": 1.0007, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "grad_norm": 2.394056797027588, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "learning_rate": 0.0006199773956630934, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.9801971912384033, + "step": 10050 + }, + { + "ce_loss": 0.2730773389339447, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.3858831822872162, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.19404278695583344, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.8788453936576843, + "step": 10050 + }, + { + "ce_loss": 0.2635515332221985, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.38254135847091675, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.17534977197647095, + "step": 10050 + }, + { + "epoch": 3.355570380253502, + "loss": 1.001, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "grad_norm": 3.505295753479004, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "learning_rate": 0.0006196163507140602, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.7710726261138916, + "step": 10060 + }, + { + "ce_loss": 0.19496233761310577, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.38631999492645264, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.18970170617103577, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.8161435127258301, + "step": 10060 + }, + { + "ce_loss": 0.21463973820209503, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.42355772852897644, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.17776453495025635, + "step": 10060 + }, + { + "epoch": 3.3589059372915275, + "loss": 1.0532, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "grad_norm": 1.9118040800094604, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "learning_rate": 0.000619255049449114, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.8997465372085571, + "step": 10070 + }, + { + "ce_loss": 0.23062358796596527, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.39321815967559814, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.2236202508211136, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.9231383800506592, + "step": 10070 + }, + { + "ce_loss": 0.2514480948448181, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.3488033711910248, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.19309905171394348, + "step": 10070 + }, + { + "epoch": 3.362241494329553, + "loss": 0.9295, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "grad_norm": 2.3443384170532227, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "learning_rate": 0.0006188934922899324, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.8065698146820068, + "step": 10080 + }, + { + "ce_loss": 0.24393002688884735, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.33297401666641235, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.18890532851219177, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.9825599789619446, + "step": 10080 + }, + { + "ce_loss": 0.2975650727748871, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.4654487371444702, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.1721816211938858, + "step": 10080 + }, + { + "epoch": 3.3655770513675782, + "loss": 1.0455, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "grad_norm": 3.9115467071533203, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "learning_rate": 0.0006185316796584912, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 1.1405165195465088, + "step": 10090 + }, + { + "ce_loss": 0.2993922531604767, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.46894127130508423, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.18077045679092407, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.930509090423584, + "step": 10090 + }, + { + "ce_loss": 0.21698082983493805, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.44211745262145996, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.17781756818294525, + "step": 10090 + }, + { + "epoch": 3.3689126084056036, + "loss": 1.0448, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "grad_norm": 1.9488577842712402, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "learning_rate": 0.0006181696119770651, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 1.8516427278518677, + "step": 10100 + }, + { + "ce_loss": 0.2993837296962738, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.45562267303466797, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.18279919028282166, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 0.956829309463501, + "step": 10100 + }, + { + "ce_loss": 0.22248753905296326, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.4272536635398865, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.15380191802978516, + "step": 10100 + }, + { + "epoch": 3.372248165443629, + "loss": 1.0941, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "grad_norm": 2.792809009552002, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "learning_rate": 0.0006178072896682257, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 1.0727574825286865, + "step": 10110 + }, + { + "ce_loss": 0.2581041753292084, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.47537761926651, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.17899997532367706, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 0.779070258140564, + "step": 10110 + }, + { + "ce_loss": 0.24334947764873505, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.3363867402076721, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.15977618098258972, + "step": 10110 + }, + { + "epoch": 3.3755837224816543, + "loss": 1.0354, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "grad_norm": 1.8961207866668701, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "learning_rate": 0.0006174447131548421, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.9844977259635925, + "step": 10120 + }, + { + "ce_loss": 0.3068873882293701, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.4935329854488373, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.1830204725265503, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.8539527654647827, + "step": 10120 + }, + { + "ce_loss": 0.27278390526771545, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.3903404176235199, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.19060420989990234, + "step": 10120 + }, + { + "epoch": 3.3789192795196796, + "loss": 1.0105, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "grad_norm": 2.0325279235839844, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "learning_rate": 0.0006170818828600802, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.8580547571182251, + "step": 10130 + }, + { + "ce_loss": 0.24313877522945404, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.306068480014801, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.23506256937980652, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.7390968203544617, + "step": 10130 + }, + { + "ce_loss": 0.17966081202030182, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.309879332780838, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.18670016527175903, + "step": 10130 + }, + { + "epoch": 3.382254836557705, + "loss": 1.0019, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "grad_norm": 2.281717538833618, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "learning_rate": 0.0006167187992074021, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 0.6992474794387817, + "step": 10140 + }, + { + "ce_loss": 0.15938930213451385, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.36157307028770447, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.17812681198120117, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 1.0849311351776123, + "step": 10140 + }, + { + "ce_loss": 0.23215362429618835, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.4448317885398865, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.20249246060848236, + "step": 10140 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.9154, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "grad_norm": 1.4965434074401855, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "learning_rate": 0.0006163554626205655, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.7812495827674866, + "step": 10150 + }, + { + "ce_loss": 0.255270391702652, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.3302749693393707, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.19548340141773224, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.9711526036262512, + "step": 10150 + }, + { + "ce_loss": 0.31296950578689575, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.38431110978126526, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.19149935245513916, + "step": 10150 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.8598, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "grad_norm": 2.4595344066619873, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "learning_rate": 0.0006159918735236232, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 1.1318943500518799, + "step": 10160 + }, + { + "ce_loss": 0.19478745758533478, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.42160463333129883, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.17801080644130707, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 1.0755106210708618, + "step": 10160 + }, + { + "ce_loss": 0.33530953526496887, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.4514789581298828, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.22947563230991364, + "step": 10160 + }, + { + "epoch": 3.392261507671781, + "loss": 1.0222, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "grad_norm": 2.17761492729187, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "learning_rate": 0.0006156280323409227, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 1.1920362710952759, + "step": 10170 + }, + { + "ce_loss": 0.2914145588874817, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.4543791711330414, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.27146637439727783, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 0.8673732280731201, + "step": 10170 + }, + { + "ce_loss": 0.280834436416626, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.42488738894462585, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.16155914962291718, + "step": 10170 + }, + { + "epoch": 3.3955970647098064, + "loss": 1.0552, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "grad_norm": 1.9043627977371216, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "learning_rate": 0.000615263939497106, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.870008111000061, + "step": 10180 + }, + { + "ce_loss": 0.2507705092430115, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.3454810678958893, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.18389473855495453, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 1.1123414039611816, + "step": 10180 + }, + { + "ce_loss": 0.2997893691062927, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.4631223976612091, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.2144378125667572, + "step": 10180 + }, + { + "epoch": 3.3989326217478317, + "loss": 1.0243, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "grad_norm": 2.3104279041290283, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "learning_rate": 0.0006148995954171084, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.8569172620773315, + "step": 10190 + }, + { + "ce_loss": 0.24644263088703156, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.39549142122268677, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.18206985294818878, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 1.314200758934021, + "step": 10190 + }, + { + "ce_loss": 0.30596616864204407, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.37953469157218933, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.24160854518413544, + "step": 10190 + }, + { + "epoch": 3.402268178785857, + "loss": 0.9644, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "grad_norm": 1.7902569770812988, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "learning_rate": 0.000614535000526159, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 0.8502125144004822, + "step": 10200 + }, + { + "ce_loss": 0.20533131062984467, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.4242219924926758, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.1732807606458664, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 1.1380047798156738, + "step": 10200 + }, + { + "ce_loss": 0.2311955988407135, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.3482438623905182, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.17975008487701416, + "step": 10200 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.9528, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "grad_norm": 2.735464096069336, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "learning_rate": 0.000614170155249779, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.9691909551620483, + "step": 10210 + }, + { + "ce_loss": 0.2766101360321045, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.46005550026893616, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.17615088820457458, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 1.0173518657684326, + "step": 10210 + }, + { + "ce_loss": 0.2465779185295105, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.400485098361969, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.15484775602817535, + "step": 10210 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.958, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "grad_norm": 2.010167121887207, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "learning_rate": 0.0006138050600137822, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 1.0048978328704834, + "step": 10220 + }, + { + "ce_loss": 0.27049142122268677, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.46906766295433044, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.20254988968372345, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.975196361541748, + "step": 10220 + }, + { + "ce_loss": 0.22158262133598328, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.43164435029029846, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.17377683520317078, + "step": 10220 + }, + { + "epoch": 3.412274849899933, + "loss": 0.963, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "grad_norm": 2.933204174041748, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "learning_rate": 0.000613439715244274, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.7380354404449463, + "step": 10230 + }, + { + "ce_loss": 0.21490693092346191, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.3523274064064026, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.17062704265117645, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.9718881845474243, + "step": 10230 + }, + { + "ce_loss": 0.2706681489944458, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.407747358083725, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.16677233576774597, + "step": 10230 + }, + { + "epoch": 3.4156104069379585, + "loss": 1.0634, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "grad_norm": 3.1097564697265625, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "learning_rate": 0.0006130741213676509, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 1.0423635244369507, + "step": 10240 + }, + { + "ce_loss": 0.2795296907424927, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.3979673385620117, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.19955852627754211, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.8001618385314941, + "step": 10240 + }, + { + "ce_loss": 0.22154352068901062, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.33247750997543335, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.1758543998003006, + "step": 10240 + }, + { + "epoch": 3.418945963975984, + "loss": 0.9947, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "grad_norm": 1.672417402267456, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "learning_rate": 0.0006127082788106006, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.9995819330215454, + "step": 10250 + }, + { + "ce_loss": 0.22473253309726715, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.4424959719181061, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.18075767159461975, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.96095871925354, + "step": 10250 + }, + { + "ce_loss": 0.2543027997016907, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.4390068054199219, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.20820271968841553, + "step": 10250 + }, + { + "epoch": 3.422281521014009, + "loss": 1.1135, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "grad_norm": 1.6077980995178223, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "learning_rate": 0.0006123421880001004, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.7692959308624268, + "step": 10260 + }, + { + "ce_loss": 0.2198968529701233, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.37453675270080566, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.17452290654182434, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.8642454743385315, + "step": 10260 + }, + { + "ce_loss": 0.2118843048810959, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.3453439772129059, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.1575404703617096, + "step": 10260 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.9918, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "grad_norm": 2.540724992752075, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "learning_rate": 0.000611975849363418, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 1.1537764072418213, + "step": 10270 + }, + { + "ce_loss": 0.31698283553123474, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.4698025584220886, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.26983755826950073, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 1.0079697370529175, + "step": 10270 + }, + { + "ce_loss": 0.24113315343856812, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.3796297609806061, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.23882921040058136, + "step": 10270 + }, + { + "epoch": 3.42895263509006, + "loss": 0.955, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "grad_norm": 1.3848929405212402, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "learning_rate": 0.0006116092633281097, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 0.7369468212127686, + "step": 10280 + }, + { + "ce_loss": 0.17177751660346985, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.27149325609207153, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.15882954001426697, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 1.093735933303833, + "step": 10280 + }, + { + "ce_loss": 0.31845054030418396, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.3451046049594879, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.21175621449947357, + "step": 10280 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.9283, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "grad_norm": 3.0060324668884277, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "learning_rate": 0.0006112424303220212, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.8963453769683838, + "step": 10290 + }, + { + "ce_loss": 0.2973220646381378, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.40442851185798645, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.18964999914169312, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 1.4584048986434937, + "step": 10290 + }, + { + "ce_loss": 0.3163163363933563, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.4408752918243408, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.20503944158554077, + "step": 10290 + }, + { + "epoch": 3.4356237491661106, + "loss": 1.0607, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "grad_norm": 2.1007776260375977, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "learning_rate": 0.0006108753507732857, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.8503654599189758, + "step": 10300 + }, + { + "ce_loss": 0.264160692691803, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.38213497400283813, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.16330452263355255, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.7780002355575562, + "step": 10300 + }, + { + "ce_loss": 0.1894020140171051, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.3782142996788025, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.17502427101135254, + "step": 10300 + }, + { + "epoch": 3.438959306204136, + "loss": 0.9531, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "grad_norm": 7.0569610595703125, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "learning_rate": 0.0006105080251103248, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 1.0448821783065796, + "step": 10310 + }, + { + "ce_loss": 0.31122633814811707, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.38214778900146484, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.2752073407173157, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 0.9181085824966431, + "step": 10310 + }, + { + "ce_loss": 0.2127479612827301, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.4602065980434418, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.17844612896442413, + "step": 10310 + }, + { + "epoch": 3.4422948632421613, + "loss": 1.1066, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "grad_norm": 2.075953960418701, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "learning_rate": 0.000610140453761847, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.8715020418167114, + "step": 10320 + }, + { + "ce_loss": 0.19599875807762146, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.4138525128364563, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.18956471979618073, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.7932437658309937, + "step": 10320 + }, + { + "ce_loss": 0.22833150625228882, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.3652595281600952, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.16161790490150452, + "step": 10320 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.9451, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "grad_norm": 1.7833718061447144, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "learning_rate": 0.0006097726371568475, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 1.0605350732803345, + "step": 10330 + }, + { + "ce_loss": 0.2801906168460846, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.5158529281616211, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.2109398990869522, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.8374050855636597, + "step": 10330 + }, + { + "ce_loss": 0.23242981731891632, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.3921404778957367, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.21246527135372162, + "step": 10330 + }, + { + "epoch": 3.448965977318212, + "loss": 0.975, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "grad_norm": 2.5993306636810303, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "learning_rate": 0.0006094045757246081, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.868039071559906, + "step": 10340 + }, + { + "ce_loss": 0.20514139533042908, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.4160865843296051, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.1861317902803421, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.841340959072113, + "step": 10340 + }, + { + "ce_loss": 0.22988004982471466, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.3877439498901367, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.17941595613956451, + "step": 10340 + }, + { + "epoch": 3.4523015343562373, + "loss": 1.0176, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "grad_norm": 2.8510682582855225, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "learning_rate": 0.000609036269894696, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.9242404699325562, + "step": 10350 + }, + { + "ce_loss": 0.2583213150501251, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.35674169659614563, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.1694251000881195, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 1.3669097423553467, + "step": 10350 + }, + { + "ce_loss": 0.28417572379112244, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.510482668876648, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.22851471602916718, + "step": 10350 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.9795, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "grad_norm": 1.5467017889022827, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "learning_rate": 0.0006086677200969636, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 1.1130493879318237, + "step": 10360 + }, + { + "ce_loss": 0.31027817726135254, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.4479835033416748, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.21718619763851166, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.9362666010856628, + "step": 10360 + }, + { + "ce_loss": 0.2514147162437439, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.3675157427787781, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.20163263380527496, + "step": 10360 + }, + { + "epoch": 3.458972648432288, + "loss": 0.9831, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "grad_norm": 2.5050151348114014, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "learning_rate": 0.0006082989267615483, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 1.8880741596221924, + "step": 10370 + }, + { + "ce_loss": 0.2876355051994324, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.4782940447330475, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.16375888884067535, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 1.1421231031417847, + "step": 10370 + }, + { + "ce_loss": 0.29885339736938477, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.5493630170822144, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.24733687937259674, + "step": 10370 + }, + { + "epoch": 3.4623082054703134, + "loss": 1.0893, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "grad_norm": 1.438820242881775, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "learning_rate": 0.0006079298903188715, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 1.4203903675079346, + "step": 10380 + }, + { + "ce_loss": 0.3444005250930786, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.48849883675575256, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.2962939143180847, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.9848767518997192, + "step": 10380 + }, + { + "ce_loss": 0.2705131471157074, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.45133188366889954, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.21035648882389069, + "step": 10380 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.9829, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "grad_norm": 2.2390544414520264, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "learning_rate": 0.0006075606111996386, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 1.0953900814056396, + "step": 10390 + }, + { + "ce_loss": 0.31749287247657776, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.4555628001689911, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.23297156393527985, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.8897223472595215, + "step": 10390 + }, + { + "ce_loss": 0.20808438956737518, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.3228724002838135, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.16926991939544678, + "step": 10390 + }, + { + "epoch": 3.468979319546364, + "loss": 0.9657, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "grad_norm": 2.1007516384124756, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "learning_rate": 0.000607191089834838, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 0.9438608288764954, + "step": 10400 + }, + { + "ce_loss": 0.3016405701637268, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.41761133074760437, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.22418124973773956, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 1.3007851839065552, + "step": 10400 + }, + { + "ce_loss": 0.3035561144351959, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.48503366112709045, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.15072081983089447, + "step": 10400 + }, + { + "epoch": 3.4723148765843894, + "loss": 1.0097, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "grad_norm": 1.3137832880020142, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "learning_rate": 0.0006068213266557409, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 1.0803608894348145, + "step": 10410 + }, + { + "ce_loss": 0.26398447155952454, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.4109332263469696, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.187424898147583, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 1.0211793184280396, + "step": 10410 + }, + { + "ce_loss": 0.3377877473831177, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.4217336177825928, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.21612432599067688, + "step": 10410 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.9997, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "grad_norm": 1.4655508995056152, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "learning_rate": 0.0006064513220939006, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 1.1324905157089233, + "step": 10420 + }, + { + "ce_loss": 0.262099027633667, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.3017611503601074, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.20544975996017456, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.9686199426651001, + "step": 10420 + }, + { + "ce_loss": 0.23206327855587006, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.3936845362186432, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.16485126316547394, + "step": 10420 + }, + { + "epoch": 3.47898599066044, + "loss": 0.9794, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "grad_norm": 1.958141803741455, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "learning_rate": 0.0006060810765811525, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.9960145950317383, + "step": 10430 + }, + { + "ce_loss": 0.2033989578485489, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.3653845489025116, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.16449347138404846, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.7821276187896729, + "step": 10430 + }, + { + "ce_loss": 0.23763516545295715, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.38961201906204224, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.15473511815071106, + "step": 10430 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.9866, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "grad_norm": 2.7399749755859375, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "learning_rate": 0.0006057105905496125, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 1.2616089582443237, + "step": 10440 + }, + { + "ce_loss": 0.257465124130249, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.5176580548286438, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.18606305122375488, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.9673529863357544, + "step": 10440 + }, + { + "ce_loss": 0.2233096808195114, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.42469459772109985, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.16961297392845154, + "step": 10440 + }, + { + "epoch": 3.485657104736491, + "loss": 1.0652, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "grad_norm": 1.5313483476638794, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "learning_rate": 0.0006053398644316782, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 1.1006319522857666, + "step": 10450 + }, + { + "ce_loss": 0.29097020626068115, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.37788423895835876, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.2049952894449234, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 1.045691728591919, + "step": 10450 + }, + { + "ce_loss": 0.193770632147789, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.37574324011802673, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.17712724208831787, + "step": 10450 + }, + { + "epoch": 3.488992661774516, + "loss": 0.9613, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "grad_norm": 2.053473711013794, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "learning_rate": 0.0006049688986600266, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 0.8386645317077637, + "step": 10460 + }, + { + "ce_loss": 0.2251548320055008, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.2944331169128418, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.20060141384601593, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 1.4073541164398193, + "step": 10460 + }, + { + "ce_loss": 0.261447548866272, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.4781648814678192, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.19439777731895447, + "step": 10460 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.9895, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "grad_norm": 1.6879146099090576, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "learning_rate": 0.0006045976936676147, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 1.052851676940918, + "step": 10470 + }, + { + "ce_loss": 0.2677856683731079, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.4257782995700836, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.22846080362796783, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 1.032060980796814, + "step": 10470 + }, + { + "ce_loss": 0.26760971546173096, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.46396347880363464, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.22398428618907928, + "step": 10470 + }, + { + "epoch": 3.495663775850567, + "loss": 1.0215, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "grad_norm": 1.6211903095245361, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "learning_rate": 0.0006042262498876785, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.766634464263916, + "step": 10480 + }, + { + "ce_loss": 0.21379749476909637, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.324904203414917, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.17826105654239655, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.9297152757644653, + "step": 10480 + }, + { + "ce_loss": 0.20599283277988434, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.37327367067337036, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.15590085089206696, + "step": 10480 + }, + { + "epoch": 3.498999332888592, + "loss": 1.0309, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "grad_norm": 1.9489325284957886, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "learning_rate": 0.0006038545677537333, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 0.8805686235427856, + "step": 10490 + }, + { + "ce_loss": 0.15386821329593658, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.38175851106643677, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.13805674016475677, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 1.1114535331726074, + "step": 10490 + }, + { + "ce_loss": 0.32401880621910095, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.46975576877593994, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.24954313039779663, + "step": 10490 + }, + { + "epoch": 3.502334889926618, + "loss": 0.9896, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "grad_norm": 1.5480339527130127, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "learning_rate": 0.0006034826476995715, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 1.0643081665039062, + "step": 10500 + }, + { + "ce_loss": 0.24576300382614136, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.3770397901535034, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.20040734112262726, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 0.807522177696228, + "step": 10500 + }, + { + "ce_loss": 0.21417485177516937, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.4244995713233948, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.16867026686668396, + "step": 10500 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.9092, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "grad_norm": 1.5492361783981323, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "learning_rate": 0.0006031104901592645, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.974293053150177, + "step": 10510 + }, + { + "ce_loss": 0.25700998306274414, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.3952081799507141, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.19461868703365326, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 1.076209306716919, + "step": 10510 + }, + { + "ce_loss": 0.26486635208129883, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.460548460483551, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.19604913890361786, + "step": 10510 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.9397, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "grad_norm": 2.0866904258728027, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "learning_rate": 0.0006027380955671598, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.7660278677940369, + "step": 10520 + }, + { + "ce_loss": 0.21146520972251892, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.3692725598812103, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.1851666420698166, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.8923248052597046, + "step": 10520 + }, + { + "ce_loss": 0.19822822511196136, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.441368967294693, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.1543390154838562, + "step": 10520 + }, + { + "epoch": 3.512341561040694, + "loss": 0.9011, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "grad_norm": 1.7885452508926392, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "learning_rate": 0.000602365464357882, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 1.063278079032898, + "step": 10530 + }, + { + "ce_loss": 0.31426993012428284, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.34604477882385254, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.26331233978271484, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 0.5608228445053101, + "step": 10530 + }, + { + "ce_loss": 0.16896271705627441, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.24089543521404266, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.11015720665454865, + "step": 10530 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.9445, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "grad_norm": 1.596814751625061, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "learning_rate": 0.0006019925969663319, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 1.0660908222198486, + "step": 10540 + }, + { + "ce_loss": 0.24070100486278534, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.3578681945800781, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.2396494448184967, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 1.1939899921417236, + "step": 10540 + }, + { + "ce_loss": 0.25851884484291077, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.4415932297706604, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.23945797979831696, + "step": 10540 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.969, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "grad_norm": 2.290989875793457, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "learning_rate": 0.000601619493827686, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.8484582901000977, + "step": 10550 + }, + { + "ce_loss": 0.2210330218076706, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.4313220977783203, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.19588512182235718, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 1.1229126453399658, + "step": 10550 + }, + { + "ce_loss": 0.25314053893089294, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.41929417848587036, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.14629621803760529, + "step": 10550 + }, + { + "epoch": 3.52234823215477, + "loss": 1.0361, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "grad_norm": 1.6395354270935059, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "learning_rate": 0.0006012461553773955, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 0.9260293841362, + "step": 10560 + }, + { + "ce_loss": 0.29744088649749756, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.3920683264732361, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.23435044288635254, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 0.9376093149185181, + "step": 10560 + }, + { + "ce_loss": 0.22937791049480438, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.3874242305755615, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.18697050213813782, + "step": 10560 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.9756, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "grad_norm": 2.079847574234009, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "learning_rate": 0.0006008725820511866, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.8667013049125671, + "step": 10570 + }, + { + "ce_loss": 0.23142340779304504, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.4217209815979004, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.1697525829076767, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.9974404573440552, + "step": 10570 + }, + { + "ce_loss": 0.2349308580160141, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.5080359578132629, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.1930299997329712, + "step": 10570 + }, + { + "epoch": 3.529019346230821, + "loss": 0.9553, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "grad_norm": 2.677853584289551, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "learning_rate": 0.0006004987742850598, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.8715407252311707, + "step": 10580 + }, + { + "ce_loss": 0.24808406829833984, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.41015133261680603, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.2120196670293808, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.949081540107727, + "step": 10580 + }, + { + "ce_loss": 0.25341686606407166, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.3998900055885315, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.22252270579338074, + "step": 10580 + }, + { + "epoch": 3.532354903268846, + "loss": 1.0076, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "grad_norm": 2.008328676223755, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "learning_rate": 0.0006001247325152887, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 1.12415611743927, + "step": 10590 + }, + { + "ce_loss": 0.2488483190536499, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.4674426019191742, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.19241824746131897, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 0.8905547857284546, + "step": 10590 + }, + { + "ce_loss": 0.2781248986721039, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.4134596586227417, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.19820138812065125, + "step": 10590 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.9653, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "grad_norm": 3.509645462036133, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "learning_rate": 0.0005997504571784207, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.8519524931907654, + "step": 10600 + }, + { + "ce_loss": 0.2490682452917099, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.3506883680820465, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.20163153111934662, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.9452041983604431, + "step": 10600 + }, + { + "ce_loss": 0.24201901257038116, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.41681554913520813, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.14725689589977264, + "step": 10600 + }, + { + "epoch": 3.539026017344897, + "loss": 0.9389, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "grad_norm": 2.2754478454589844, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "learning_rate": 0.000599375948711275, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 1.0604487657546997, + "step": 10610 + }, + { + "ce_loss": 0.2615657150745392, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.4435419738292694, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.19277401268482208, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 1.0847976207733154, + "step": 10610 + }, + { + "ce_loss": 0.24040797352790833, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.3454318940639496, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.18763557076454163, + "step": 10610 + }, + { + "epoch": 3.542361574382922, + "loss": 1.0434, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "grad_norm": 1.3464022874832153, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "learning_rate": 0.0005990012075509434, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.877839207649231, + "step": 10620 + }, + { + "ce_loss": 0.20063342154026031, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.35610562562942505, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.1908491998910904, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.750281572341919, + "step": 10620 + }, + { + "ce_loss": 0.2147495448589325, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.37057164311408997, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.16475652158260345, + "step": 10620 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.9319, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "grad_norm": 1.7371325492858887, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "learning_rate": 0.000598626234134789, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.891808807849884, + "step": 10630 + }, + { + "ce_loss": 0.24432647228240967, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.45418083667755127, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.1522151380777359, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.8573769330978394, + "step": 10630 + }, + { + "ce_loss": 0.25155410170555115, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.3957569897174835, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.15537843108177185, + "step": 10630 + }, + { + "epoch": 3.549032688458973, + "loss": 0.9607, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "grad_norm": 1.7045210599899292, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "learning_rate": 0.0005982510289004467, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 0.812683641910553, + "step": 10640 + }, + { + "ce_loss": 0.2365237921476364, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.39517346024513245, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.1795228272676468, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 1.6365201473236084, + "step": 10640 + }, + { + "ce_loss": 0.3631623089313507, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.43889060616493225, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.2709457576274872, + "step": 10640 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.9761, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "grad_norm": 2.293968915939331, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "learning_rate": 0.0005978755922858205, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.8294329047203064, + "step": 10650 + }, + { + "ce_loss": 0.23119644820690155, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.3696436285972595, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.17142550647258759, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.8790150880813599, + "step": 10650 + }, + { + "ce_loss": 0.23846159875392914, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.4104042649269104, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.1691407859325409, + "step": 10650 + }, + { + "epoch": 3.5557038025350236, + "loss": 1.0186, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "grad_norm": 2.333033800125122, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "learning_rate": 0.0005974999247290862, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 1.0636610984802246, + "step": 10660 + }, + { + "ce_loss": 0.27940917015075684, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.35590964555740356, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.1658158302307129, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.8477701544761658, + "step": 10660 + }, + { + "ce_loss": 0.18838250637054443, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.31615662574768066, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.1694744974374771, + "step": 10660 + }, + { + "epoch": 3.559039359573049, + "loss": 0.9494, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "grad_norm": 2.738006114959717, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "learning_rate": 0.0005971240266686877, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 0.9802939891815186, + "step": 10670 + }, + { + "ce_loss": 0.3043555021286011, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.4548279047012329, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.17479896545410156, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 1.0556284189224243, + "step": 10670 + }, + { + "ce_loss": 0.29174360632896423, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.4765718877315521, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.1979597806930542, + "step": 10670 + }, + { + "epoch": 3.5623749166110743, + "loss": 1.0208, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "grad_norm": 2.6067469120025635, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "learning_rate": 0.0005967478985433387, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 1.05019211769104, + "step": 10680 + }, + { + "ce_loss": 0.27029672265052795, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.5363603234291077, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.19258911907672882, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 1.0431991815567017, + "step": 10680 + }, + { + "ce_loss": 0.3473112881183624, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.4352015554904938, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.20662081241607666, + "step": 10680 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.9695, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "grad_norm": 1.6148370504379272, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "learning_rate": 0.000596371540792021, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 1.0217854976654053, + "step": 10690 + }, + { + "ce_loss": 0.290711373090744, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.40897056460380554, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.1742202490568161, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.9119325280189514, + "step": 10690 + }, + { + "ce_loss": 0.19622530043125153, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.344217449426651, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.17378860712051392, + "step": 10690 + }, + { + "epoch": 3.569046030687125, + "loss": 1.0141, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "grad_norm": 2.3237545490264893, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "learning_rate": 0.000595994953853985, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 1.8157587051391602, + "step": 10700 + }, + { + "ce_loss": 0.27670037746429443, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.43991291522979736, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.19433966279029846, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 1.0240585803985596, + "step": 10700 + }, + { + "ce_loss": 0.3436286449432373, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.40613216161727905, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.2552061676979065, + "step": 10700 + }, + { + "epoch": 3.5723815877251504, + "loss": 1.0385, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "grad_norm": 1.6229546070098877, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "learning_rate": 0.0005956181381687477, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.8478896617889404, + "step": 10710 + }, + { + "ce_loss": 0.252267062664032, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.3502632975578308, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.19959960877895355, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.965064287185669, + "step": 10710 + }, + { + "ce_loss": 0.3000277876853943, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.4041358530521393, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.20794537663459778, + "step": 10710 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.9149, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "grad_norm": 1.5637110471725464, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "learning_rate": 0.000595241094176094, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.9542688131332397, + "step": 10720 + }, + { + "ce_loss": 0.26275670528411865, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.4302704334259033, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.23514986038208008, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.9637356996536255, + "step": 10720 + }, + { + "ce_loss": 0.2058098167181015, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.4770658016204834, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.1876608431339264, + "step": 10720 + }, + { + "epoch": 3.579052701801201, + "loss": 0.9496, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "grad_norm": 2.5232837200164795, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "learning_rate": 0.0005948638223160744, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 1.3978841304779053, + "step": 10730 + }, + { + "ce_loss": 0.23244185745716095, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.3715016841888428, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.1973060518503189, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 1.1500132083892822, + "step": 10730 + }, + { + "ce_loss": 0.2621765732765198, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.33173489570617676, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.20012745261192322, + "step": 10730 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.9951, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "grad_norm": 2.035033702850342, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "learning_rate": 0.000594486323029006, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 1.2532624006271362, + "step": 10740 + }, + { + "ce_loss": 0.24884285032749176, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.38106995820999146, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.23058843612670898, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.8896252512931824, + "step": 10740 + }, + { + "ce_loss": 0.21895568072795868, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.45231279730796814, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.21800543367862701, + "step": 10740 + }, + { + "epoch": 3.5857238158772518, + "loss": 1.1031, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "grad_norm": 1.63994300365448, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "learning_rate": 0.0005941085967554711, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 1.191534161567688, + "step": 10750 + }, + { + "ce_loss": 0.1863107532262802, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.34788045287132263, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.12304037064313889, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.9901374578475952, + "step": 10750 + }, + { + "ce_loss": 0.3216560482978821, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.4349941313266754, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.1874067634344101, + "step": 10750 + }, + { + "epoch": 3.589059372915277, + "loss": 0.9665, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "grad_norm": 1.9990334510803223, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "learning_rate": 0.0005937306439363168, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 1.1465106010437012, + "step": 10760 + }, + { + "ce_loss": 0.2501791715621948, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.4361908435821533, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.13454364240169525, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 1.2411580085754395, + "step": 10760 + }, + { + "ce_loss": 0.30965420603752136, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.38276880979537964, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.18235190212726593, + "step": 10760 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.9779, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "grad_norm": 1.4198142290115356, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "learning_rate": 0.0005933524650126546, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 1.104905128479004, + "step": 10770 + }, + { + "ce_loss": 0.2916855216026306, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.4638969302177429, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.25216418504714966, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.8804208040237427, + "step": 10770 + }, + { + "ce_loss": 0.22146829962730408, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.37109169363975525, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.16120927035808563, + "step": 10770 + }, + { + "epoch": 3.595730486991328, + "loss": 0.9963, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "grad_norm": 8.41401195526123, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "learning_rate": 0.0005929740604258603, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 0.9323166012763977, + "step": 10780 + }, + { + "ce_loss": 0.2917608916759491, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.42487627267837524, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.1706487238407135, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 1.0221028327941895, + "step": 10780 + }, + { + "ce_loss": 0.25380828976631165, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.38979148864746094, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.20026403665542603, + "step": 10780 + }, + { + "epoch": 3.599066044029353, + "loss": 0.9955, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "grad_norm": 3.6208415031433105, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "learning_rate": 0.0005925954306175725, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.9474771618843079, + "step": 10790 + }, + { + "ce_loss": 0.32380348443984985, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.39571648836135864, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.17513123154640198, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.7710244059562683, + "step": 10790 + }, + { + "ce_loss": 0.2452814131975174, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.3207455277442932, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.1587689220905304, + "step": 10790 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.981, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "grad_norm": 1.6106643676757812, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "learning_rate": 0.0005922165760296932, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.6965634226799011, + "step": 10800 + }, + { + "ce_loss": 0.23448844254016876, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.317073792219162, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.14483723044395447, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 1.04850435256958, + "step": 10800 + }, + { + "ce_loss": 0.2612713575363159, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.4225085377693176, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.2115650475025177, + "step": 10800 + }, + { + "epoch": 3.605737158105404, + "loss": 1.0208, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "grad_norm": 1.9757091999053955, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "learning_rate": 0.0005918374971043862, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 1.018848180770874, + "step": 10810 + }, + { + "ce_loss": 0.2821508049964905, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.4402734339237213, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.18855346739292145, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 1.0459314584732056, + "step": 10810 + }, + { + "ce_loss": 0.26500678062438965, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.43279725313186646, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.23459219932556152, + "step": 10810 + }, + { + "epoch": 3.609072715143429, + "loss": 0.9469, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "grad_norm": 1.9686979055404663, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "learning_rate": 0.0005914581942840775, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 0.6443910002708435, + "step": 10820 + }, + { + "ce_loss": 0.15013271570205688, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.33419257402420044, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.15990029275417328, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 1.009469747543335, + "step": 10820 + }, + { + "ce_loss": 0.24576064944267273, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.4632691740989685, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.18871477246284485, + "step": 10820 + }, + { + "epoch": 3.6124082721814545, + "loss": 1.0257, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "grad_norm": 2.7029173374176025, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "learning_rate": 0.0005910786680114544, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.9071427583694458, + "step": 10830 + }, + { + "ce_loss": 0.2030387669801712, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.3390215337276459, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.1907871961593628, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.8759766817092896, + "step": 10830 + }, + { + "ce_loss": 0.2447219043970108, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.4003208875656128, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.17160019278526306, + "step": 10830 + }, + { + "epoch": 3.61574382921948, + "loss": 0.997, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "grad_norm": 2.0608105659484863, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "learning_rate": 0.0005906989187294649, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.8163762092590332, + "step": 10840 + }, + { + "ce_loss": 0.20944543182849884, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.3983843922615051, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.1529301255941391, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.7047134041786194, + "step": 10840 + }, + { + "ce_loss": 0.17927296459674835, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.37407517433166504, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.15111172199249268, + "step": 10840 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.9361, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "grad_norm": 6.514796257019043, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "learning_rate": 0.0005903189468813169, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 1.120654582977295, + "step": 10850 + }, + { + "ce_loss": 0.30272042751312256, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.4027269780635834, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.23278942704200745, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 1.0802335739135742, + "step": 10850 + }, + { + "ce_loss": 0.29891490936279297, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.437322199344635, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.2128477692604065, + "step": 10850 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.9784, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "grad_norm": 2.744908332824707, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "learning_rate": 0.000589938752910479, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 1.092473030090332, + "step": 10860 + }, + { + "ce_loss": 0.1885896474123001, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.4350079298019409, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.1690068542957306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.9842666387557983, + "step": 10860 + }, + { + "ce_loss": 0.26728078722953796, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.42367982864379883, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.24502113461494446, + "step": 10860 + }, + { + "epoch": 3.625750500333556, + "loss": 0.9453, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "grad_norm": 2.9420664310455322, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "learning_rate": 0.000589558337260678, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 0.8280543088912964, + "step": 10870 + }, + { + "ce_loss": 0.21766088902950287, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.3599540591239929, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.17570890486240387, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 1.2867872714996338, + "step": 10870 + }, + { + "ce_loss": 0.32790401577949524, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.4899286925792694, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.2533458173274994, + "step": 10870 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.9934, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "grad_norm": 1.9164032936096191, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "learning_rate": 0.0005891777003759002, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 1.0848420858383179, + "step": 10880 + }, + { + "ce_loss": 0.34023869037628174, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.47535526752471924, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.2687413692474365, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.7790800333023071, + "step": 10880 + }, + { + "ce_loss": 0.20252028107643127, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.35906457901000977, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.16440549492835999, + "step": 10880 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.918, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "grad_norm": 1.9566514492034912, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "learning_rate": 0.0005887968427003898, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.8718944191932678, + "step": 10890 + }, + { + "ce_loss": 0.2936494052410126, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.35506755113601685, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.22286319732666016, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.9113027453422546, + "step": 10890 + }, + { + "ce_loss": 0.26407554745674133, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.38341107964515686, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.2064923644065857, + "step": 10890 + }, + { + "epoch": 3.635757171447632, + "loss": 0.9026, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "grad_norm": 2.067293405532837, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "learning_rate": 0.0005884157646786482, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 1.0706310272216797, + "step": 10900 + }, + { + "ce_loss": 0.2915562689304352, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.37129735946655273, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.26088154315948486, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 0.9378951787948608, + "step": 10900 + }, + { + "ce_loss": 0.22501979768276215, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.4099835455417633, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.20775926113128662, + "step": 10900 + }, + { + "epoch": 3.6390927284856573, + "loss": 1.0149, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "grad_norm": 1.3769757747650146, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "learning_rate": 0.0005880344667554353, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.9676283597946167, + "step": 10910 + }, + { + "ce_loss": 0.27713117003440857, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.4113433063030243, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.18202325701713562, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.9402757883071899, + "step": 10910 + }, + { + "ce_loss": 0.276583731174469, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.4893118143081665, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.17388537526130676, + "step": 10910 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.984, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "grad_norm": 1.5481610298156738, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "learning_rate": 0.0005876529493757661, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 1.0869348049163818, + "step": 10920 + }, + { + "ce_loss": 0.19671200215816498, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.36423802375793457, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.14919424057006836, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.9016743898391724, + "step": 10920 + }, + { + "ce_loss": 0.18895894289016724, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.39317190647125244, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.15948988497257233, + "step": 10920 + }, + { + "epoch": 3.645763842561708, + "loss": 0.9071, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "grad_norm": 1.7900937795639038, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "learning_rate": 0.0005872712129849128, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 1.1257048845291138, + "step": 10930 + }, + { + "ce_loss": 0.2748940885066986, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.4411807060241699, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.15890592336654663, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 0.7515501379966736, + "step": 10930 + }, + { + "ce_loss": 0.20219367742538452, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.336357057094574, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.16738320887088776, + "step": 10930 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.9312, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "grad_norm": 1.6814641952514648, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "learning_rate": 0.0005868892580284026, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.9259946346282959, + "step": 10940 + }, + { + "ce_loss": 0.1978679597377777, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.39133840799331665, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.2005710005760193, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.7356423735618591, + "step": 10940 + }, + { + "ce_loss": 0.20834468305110931, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.3353995084762573, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.11429071426391602, + "step": 10940 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.9866, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "grad_norm": 1.6673353910446167, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "learning_rate": 0.0005865070849520184, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 1.0651073455810547, + "step": 10950 + }, + { + "ce_loss": 0.26207685470581055, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.39236509799957275, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.17330333590507507, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.9178107380867004, + "step": 10950 + }, + { + "ce_loss": 0.2152654230594635, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.3327520191669464, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.1945221871137619, + "step": 10950 + }, + { + "epoch": 3.655770513675784, + "loss": 0.9223, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "grad_norm": 2.0015439987182617, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "learning_rate": 0.0005861246942017968, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.7507296800613403, + "step": 10960 + }, + { + "ce_loss": 0.19525940716266632, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.3391239643096924, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.15782354772090912, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.6286138892173767, + "step": 10960 + }, + { + "ce_loss": 0.1707758605480194, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.325600802898407, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.1315843015909195, + "step": 10960 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.94, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "grad_norm": 1.5580071210861206, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "learning_rate": 0.0005857420862240293, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 1.1839573383331299, + "step": 10970 + }, + { + "ce_loss": 0.22133544087409973, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.34468215703964233, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.19706854224205017, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.7060577273368835, + "step": 10970 + }, + { + "ce_loss": 0.17872220277786255, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.35810860991477966, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.16883009672164917, + "step": 10970 + }, + { + "epoch": 3.662441627751835, + "loss": 1.0542, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "grad_norm": 2.0040171146392822, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "learning_rate": 0.0005853592614652605, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.7581341862678528, + "step": 10980 + }, + { + "ce_loss": 0.21842674911022186, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.3178858160972595, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.18037572503089905, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.9531198143959045, + "step": 10980 + }, + { + "ce_loss": 0.27651041746139526, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.44763290882110596, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.15781889855861664, + "step": 10980 + }, + { + "epoch": 3.66577718478986, + "loss": 0.9635, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "grad_norm": 2.4893198013305664, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "learning_rate": 0.0005849762203722882, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 0.7130229473114014, + "step": 10990 + }, + { + "ce_loss": 0.2273818701505661, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.2687847912311554, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.21665076911449432, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 1.124692678451538, + "step": 10990 + }, + { + "ce_loss": 0.2515711486339569, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.3417477011680603, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.21203409135341644, + "step": 10990 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.8726, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "grad_norm": 1.6779652833938599, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "learning_rate": 0.0005845929633921623, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.9800863265991211, + "step": 11000 + }, + { + "ce_loss": 0.28845295310020447, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.3359338939189911, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.2133590430021286, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.9813275933265686, + "step": 11000 + }, + { + "ce_loss": 0.23539577424526215, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.39678841829299927, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.1863996386528015, + "step": 11000 + }, + { + "epoch": 3.672448298865911, + "loss": 0.9235, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "grad_norm": 2.2331013679504395, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "learning_rate": 0.0005842094909721852, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 0.6382297277450562, + "step": 11010 + }, + { + "ce_loss": 0.14464932680130005, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.32155758142471313, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.171687051653862, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 1.0422011613845825, + "step": 11010 + }, + { + "ce_loss": 0.24493230879306793, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.4178438186645508, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.18016548454761505, + "step": 11010 + }, + { + "epoch": 3.675783855903936, + "loss": 0.9028, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "grad_norm": 3.1568448543548584, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "learning_rate": 0.0005838258035599103, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 1.1515343189239502, + "step": 11020 + }, + { + "ce_loss": 0.20892977714538574, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.4254738688468933, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.19011640548706055, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 1.3563225269317627, + "step": 11020 + }, + { + "ce_loss": 0.288394957780838, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.4447449743747711, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.1841244399547577, + "step": 11020 + }, + { + "epoch": 3.6791194129419615, + "loss": 1.0352, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "grad_norm": 1.798572301864624, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "learning_rate": 0.0005834419016031423, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 1.4670379161834717, + "step": 11030 + }, + { + "ce_loss": 0.27505266666412354, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.4791155755519867, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.23881269991397858, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 0.8022658824920654, + "step": 11030 + }, + { + "ce_loss": 0.19052447378635406, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.4294961392879486, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.1384320855140686, + "step": 11030 + }, + { + "epoch": 3.682454969979987, + "loss": 0.9148, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "grad_norm": 2.133450746536255, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "learning_rate": 0.0005830577855499359, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 1.0161513090133667, + "step": 11040 + }, + { + "ce_loss": 0.2720661759376526, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.41905316710472107, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.23214557766914368, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 0.8970419764518738, + "step": 11040 + }, + { + "ce_loss": 0.22910916805267334, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.37953558564186096, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.17511329054832458, + "step": 11040 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.9784, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "grad_norm": 2.5500400066375732, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "learning_rate": 0.0005826734558485959, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 1.0715348720550537, + "step": 11050 + }, + { + "ce_loss": 0.27266642451286316, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.46318167448043823, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.202993243932724, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.8450025320053101, + "step": 11050 + }, + { + "ce_loss": 0.21356584131717682, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.35461321473121643, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.1333731859922409, + "step": 11050 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.9594, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "grad_norm": 1.717258334159851, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "learning_rate": 0.0005822889129476765, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.6976757049560547, + "step": 11060 + }, + { + "ce_loss": 0.23821891844272614, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.3095102310180664, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.14964300394058228, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.663037121295929, + "step": 11060 + }, + { + "ce_loss": 0.1527738869190216, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.25134557485580444, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.12417822331190109, + "step": 11060 + }, + { + "epoch": 3.692461641094063, + "loss": 0.9893, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "grad_norm": 2.8804426193237305, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "learning_rate": 0.0005819041572959804, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 0.8692727088928223, + "step": 11070 + }, + { + "ce_loss": 0.2544071674346924, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.40960726141929626, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.15840329229831696, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 1.0427039861679077, + "step": 11070 + }, + { + "ce_loss": 0.2617495357990265, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.36591702699661255, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.21506667137145996, + "step": 11070 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.9297, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "grad_norm": 4.881679534912109, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "learning_rate": 0.0005815191893425593, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.8299713730812073, + "step": 11080 + }, + { + "ce_loss": 0.23638109862804413, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.4183204174041748, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.17506512999534607, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 1.1771599054336548, + "step": 11080 + }, + { + "ce_loss": 0.34898120164871216, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.5415797829627991, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.23597785830497742, + "step": 11080 + }, + { + "epoch": 3.6991327551701136, + "loss": 1.0256, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "grad_norm": 2.5172195434570312, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "learning_rate": 0.0005811340095367119, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 1.090559482574463, + "step": 11090 + }, + { + "ce_loss": 0.25162357091903687, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.3787229061126709, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.21387702226638794, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 1.1414086818695068, + "step": 11090 + }, + { + "ce_loss": 0.22263747453689575, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.44168949127197266, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.18623687326908112, + "step": 11090 + }, + { + "epoch": 3.702468312208139, + "loss": 0.9992, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "grad_norm": 2.178071975708008, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "learning_rate": 0.0005807486183279844, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 1.136382818222046, + "step": 11100 + }, + { + "ce_loss": 0.3223716616630554, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.4490458369255066, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.17996446788311005, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 0.9748097658157349, + "step": 11100 + }, + { + "ce_loss": 0.20400959253311157, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.47114190459251404, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.16003276407718658, + "step": 11100 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.9737, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "grad_norm": 1.476382851600647, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "learning_rate": 0.0005803630161661702, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.6786836981773376, + "step": 11110 + }, + { + "ce_loss": 0.16755782067775726, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.32450413703918457, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.14207275211811066, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.9087648987770081, + "step": 11110 + }, + { + "ce_loss": 0.18221266567707062, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.449026882648468, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.16479560732841492, + "step": 11110 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.9624, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "grad_norm": 2.298190116882324, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "learning_rate": 0.000579977203501308, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.9394449591636658, + "step": 11120 + }, + { + "ce_loss": 0.25217294692993164, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.48344358801841736, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.17006704211235046, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.8247861862182617, + "step": 11120 + }, + { + "ce_loss": 0.18868543207645416, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.41085153818130493, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.17868436872959137, + "step": 11120 + }, + { + "epoch": 3.712474983322215, + "loss": 0.9201, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "grad_norm": 1.8907734155654907, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "learning_rate": 0.0005795911807836831, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 1.0551815032958984, + "step": 11130 + }, + { + "ce_loss": 0.2505593001842499, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.36986488103866577, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.20263658463954926, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 0.8181856870651245, + "step": 11130 + }, + { + "ce_loss": 0.2138008326292038, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.3812239170074463, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.17711184918880463, + "step": 11130 + }, + { + "epoch": 3.7158105403602404, + "loss": 1.0005, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "grad_norm": 3.0578932762145996, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "learning_rate": 0.0005792049484638254, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.9433335661888123, + "step": 11140 + }, + { + "ce_loss": 0.24768595397472382, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.45756620168685913, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.16918687522411346, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 1.0306591987609863, + "step": 11140 + }, + { + "ce_loss": 0.3187006711959839, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.3860379755496979, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.26220548152923584, + "step": 11140 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.9562, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "grad_norm": 2.8249945640563965, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "learning_rate": 0.0005788185069925095, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.838791012763977, + "step": 11150 + }, + { + "ce_loss": 0.2277652472257614, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.3474879264831543, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.21634413301944733, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.5670017004013062, + "step": 11150 + }, + { + "ce_loss": 0.15619748830795288, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.2370021492242813, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.1731836497783661, + "step": 11150 + }, + { + "epoch": 3.722481654436291, + "loss": 0.8697, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "grad_norm": 1.6773978471755981, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "learning_rate": 0.0005784318568207546, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 1.0860368013381958, + "step": 11160 + }, + { + "ce_loss": 0.31327375769615173, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.4752776026725769, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.2365313172340393, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 0.9724563956260681, + "step": 11160 + }, + { + "ce_loss": 0.22032691538333893, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.45673760771751404, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.17892897129058838, + "step": 11160 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.9788, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "grad_norm": 1.7261686325073242, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "learning_rate": 0.0005780449983998224, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.8740634322166443, + "step": 11170 + }, + { + "ce_loss": 0.25200915336608887, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.36934924125671387, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.19166813790798187, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.8459035158157349, + "step": 11170 + }, + { + "ce_loss": 0.27757108211517334, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.3620123267173767, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.2055073231458664, + "step": 11170 + }, + { + "epoch": 3.729152768512342, + "loss": 0.9375, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "grad_norm": 2.763355016708374, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "learning_rate": 0.0005776579321812187, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.8284812569618225, + "step": 11180 + }, + { + "ce_loss": 0.2111487090587616, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.4274722635746002, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.189053013920784, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.868732213973999, + "step": 11180 + }, + { + "ce_loss": 0.24561914801597595, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.4243730902671814, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.1535656601190567, + "step": 11180 + }, + { + "epoch": 3.732488325550367, + "loss": 0.9525, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "grad_norm": 1.8700323104858398, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "learning_rate": 0.0005772706586166914, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.8410372734069824, + "step": 11190 + }, + { + "ce_loss": 0.20929786562919617, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.3191835880279541, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.17638051509857178, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.9550619721412659, + "step": 11190 + }, + { + "ce_loss": 0.2897661030292511, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.4045095443725586, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.2052481323480606, + "step": 11190 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.8988, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "grad_norm": 1.7248882055282593, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "learning_rate": 0.0005768831781582304, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.7426757216453552, + "step": 11200 + }, + { + "ce_loss": 0.19427357614040375, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.2836810350418091, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.20022033154964447, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.8273655772209167, + "step": 11200 + }, + { + "ce_loss": 0.21153953671455383, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.3719659447669983, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.18563783168792725, + "step": 11200 + }, + { + "epoch": 3.739159439626418, + "loss": 0.9277, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "grad_norm": 2.4176762104034424, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "learning_rate": 0.000576495491258067, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 1.3678982257843018, + "step": 11210 + }, + { + "ce_loss": 0.2704596221446991, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.3776765465736389, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.22148384153842926, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 1.6562130451202393, + "step": 11210 + }, + { + "ce_loss": 0.15484488010406494, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.3262500464916229, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.15828606486320496, + "step": 11210 + }, + { + "epoch": 3.742494996664443, + "loss": 0.9934, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "grad_norm": 1.7548892498016357, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "learning_rate": 0.0005761075983686738, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.9841724634170532, + "step": 11220 + }, + { + "ce_loss": 0.27450743317604065, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.37013399600982666, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.20493614673614502, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.8894741535186768, + "step": 11220 + }, + { + "ce_loss": 0.31326839327812195, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.3378714621067047, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.19991189241409302, + "step": 11220 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.9505, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "grad_norm": 2.3924026489257812, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "learning_rate": 0.000575719499942763, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.9752727150917053, + "step": 11230 + }, + { + "ce_loss": 0.25048398971557617, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.3495645225048065, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.22064881026744843, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.8900600671768188, + "step": 11230 + }, + { + "ce_loss": 0.2559802830219269, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.42682915925979614, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.1657843142747879, + "step": 11230 + }, + { + "epoch": 3.749166110740494, + "loss": 0.9819, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "grad_norm": 4.172050952911377, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "learning_rate": 0.0005753311964332878, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.7777035236358643, + "step": 11240 + }, + { + "ce_loss": 0.21494880318641663, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.3411872386932373, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.18135559558868408, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.9176045656204224, + "step": 11240 + }, + { + "ce_loss": 0.24806998670101166, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.4173784852027893, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.19084446132183075, + "step": 11240 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.9266, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "grad_norm": 2.111720323562622, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "learning_rate": 0.0005749426882934399, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.9959545731544495, + "step": 11250 + }, + { + "ce_loss": 0.31441760063171387, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.4108336865901947, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.21832722425460815, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 1.1574323177337646, + "step": 11250 + }, + { + "ce_loss": 0.3039410710334778, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.46679091453552246, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.21937808394432068, + "step": 11250 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.8932, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "grad_norm": 1.6136808395385742, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "learning_rate": 0.0005745539759766502, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 1.2521703243255615, + "step": 11260 + }, + { + "ce_loss": 0.2340213805437088, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.44366782903671265, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.20129236578941345, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.8436146974563599, + "step": 11260 + }, + { + "ce_loss": 0.24747362732887268, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.38767534494400024, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.20765142142772675, + "step": 11260 + }, + { + "epoch": 3.75917278185457, + "loss": 0.925, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "grad_norm": 5.605003833770752, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "learning_rate": 0.0005741650599365877, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 1.3394966125488281, + "step": 11270 + }, + { + "ce_loss": 0.3728139400482178, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.4576507806777954, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.21766957640647888, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 0.9282214045524597, + "step": 11270 + }, + { + "ce_loss": 0.22735916078090668, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.38220497965812683, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.14693810045719147, + "step": 11270 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.9751, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "grad_norm": 2.9692952632904053, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "learning_rate": 0.0005737759406271593, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.9229706525802612, + "step": 11280 + }, + { + "ce_loss": 0.20989659428596497, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.40266168117523193, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.18926185369491577, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.7887105345726013, + "step": 11280 + }, + { + "ce_loss": 0.2364315539598465, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.3450629413127899, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.14800573885440826, + "step": 11280 + }, + { + "epoch": 3.7658438959306206, + "loss": 1.0287, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "grad_norm": 3.338712692260742, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "learning_rate": 0.000573386618502509, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 1.0351455211639404, + "step": 11290 + }, + { + "ce_loss": 0.21638186275959015, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.4342281222343445, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.1624898761510849, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.7106236815452576, + "step": 11290 + }, + { + "ce_loss": 0.19261404871940613, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.3417753577232361, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.17591843008995056, + "step": 11290 + }, + { + "epoch": 3.769179452968646, + "loss": 0.9465, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "grad_norm": 2.0295517444610596, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "learning_rate": 0.000572997094017018, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 0.8671320676803589, + "step": 11300 + }, + { + "ce_loss": 0.2617754638195038, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.3775753378868103, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.19144763052463531, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 1.2064995765686035, + "step": 11300 + }, + { + "ce_loss": 0.24847352504730225, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.40565523505210876, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.2336285561323166, + "step": 11300 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.9088, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "grad_norm": 1.7244271039962769, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "learning_rate": 0.0005726073676253029, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.7884076833724976, + "step": 11310 + }, + { + "ce_loss": 0.21592627465724945, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.3208105266094208, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.1611410230398178, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.9448220729827881, + "step": 11310 + }, + { + "ce_loss": 0.2776465117931366, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.3224950432777405, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.16729578375816345, + "step": 11310 + }, + { + "epoch": 3.7758505670446967, + "loss": 0.9252, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "grad_norm": 1.8959004878997803, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "learning_rate": 0.0005722174397822165, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 1.5210129022598267, + "step": 11320 + }, + { + "ce_loss": 0.2503696084022522, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.3779032826423645, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.2040143758058548, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 1.2529124021530151, + "step": 11320 + }, + { + "ce_loss": 0.2998267412185669, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.4451632499694824, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.1881563514471054, + "step": 11320 + }, + { + "epoch": 3.779186124082722, + "loss": 1.0143, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "grad_norm": 1.705941081047058, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "learning_rate": 0.0005718273109428464, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 1.025900959968567, + "step": 11330 + }, + { + "ce_loss": 0.2714976370334625, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.47367486357688904, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.22894425690174103, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 0.8805956840515137, + "step": 11330 + }, + { + "ce_loss": 0.24296237528324127, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.47993454337120056, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.15654857456684113, + "step": 11330 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.9005, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "grad_norm": 5.436649799346924, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "learning_rate": 0.0005714369815625151, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.9933842420578003, + "step": 11340 + }, + { + "ce_loss": 0.24282170832157135, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.43368253111839294, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.18155056238174438, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 1.4033174514770508, + "step": 11340 + }, + { + "ce_loss": 0.29729801416397095, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.45099925994873047, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.1876297891139984, + "step": 11340 + }, + { + "epoch": 3.7858572381587727, + "loss": 1.102, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "grad_norm": 1.883608102798462, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "learning_rate": 0.000571046452096779, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 1.0281188488006592, + "step": 11350 + }, + { + "ce_loss": 0.2619418203830719, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.39741992950439453, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.20731395483016968, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.7758177518844604, + "step": 11350 + }, + { + "ce_loss": 0.19287358224391937, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.3553203046321869, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.14900435507297516, + "step": 11350 + }, + { + "epoch": 3.789192795196798, + "loss": 0.9732, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "grad_norm": 2.2708394527435303, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "learning_rate": 0.0005706557230014278, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.8519456386566162, + "step": 11360 + }, + { + "ce_loss": 0.24678350985050201, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.3918919265270233, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.15620338916778564, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.8818426132202148, + "step": 11360 + }, + { + "ce_loss": 0.23588576912879944, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.36162734031677246, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.22672615945339203, + "step": 11360 + }, + { + "epoch": 3.7925283522348234, + "loss": 1.0084, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "grad_norm": 2.6330015659332275, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "learning_rate": 0.0005702647947324847, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 1.1395282745361328, + "step": 11370 + }, + { + "ce_loss": 0.2408571094274521, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.3482729494571686, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.18775120377540588, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.9223992824554443, + "step": 11370 + }, + { + "ce_loss": 0.26535001397132874, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.3763105869293213, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.24394690990447998, + "step": 11370 + }, + { + "epoch": 3.795863909272849, + "loss": 0.8764, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "grad_norm": 3.5896894931793213, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "learning_rate": 0.0005698736677462048, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.9310532808303833, + "step": 11380 + }, + { + "ce_loss": 0.2635497450828552, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.4105827808380127, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.16474372148513794, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.9801605343818665, + "step": 11380 + }, + { + "ce_loss": 0.2689959406852722, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.40919217467308044, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.17264766991138458, + "step": 11380 + }, + { + "epoch": 3.799199466310874, + "loss": 1.0239, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "grad_norm": 1.7258336544036865, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "learning_rate": 0.0005694823424990755, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 1.012412190437317, + "step": 11390 + }, + { + "ce_loss": 0.30163678526878357, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.40283679962158203, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.23806728422641754, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 0.9156680703163147, + "step": 11390 + }, + { + "ce_loss": 0.23893040418624878, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.40328019857406616, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.15019917488098145, + "step": 11390 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.9566, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "grad_norm": 1.9312031269073486, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "learning_rate": 0.0005690908194478156, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.9236981868743896, + "step": 11400 + }, + { + "ce_loss": 0.27061566710472107, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.40460094809532166, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.20047782361507416, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.7674431204795837, + "step": 11400 + }, + { + "ce_loss": 0.19978666305541992, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.3495452105998993, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.17217499017715454, + "step": 11400 + }, + { + "epoch": 3.805870580386925, + "loss": 0.968, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "grad_norm": 1.6505191326141357, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "learning_rate": 0.0005686990990493743, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 1.1001862287521362, + "step": 11410 + }, + { + "ce_loss": 0.22656288743019104, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.37116605043411255, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.15069936215877533, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 1.0263774394989014, + "step": 11410 + }, + { + "ce_loss": 0.253476619720459, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.41293397545814514, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.23333688080310822, + "step": 11410 + }, + { + "epoch": 3.80920613742495, + "loss": 1.0876, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "grad_norm": 3.7331254482269287, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "learning_rate": 0.0005683071817609316, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 1.2252082824707031, + "step": 11420 + }, + { + "ce_loss": 0.20216085016727448, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.4016602337360382, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.1538444310426712, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 0.8858697414398193, + "step": 11420 + }, + { + "ce_loss": 0.2571501135826111, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.35483574867248535, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.20577125251293182, + "step": 11420 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.9702, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "grad_norm": 1.4017664194107056, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "learning_rate": 0.0005679150680398973, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 1.4945435523986816, + "step": 11430 + }, + { + "ce_loss": 0.35453975200653076, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.43121251463890076, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.21792420744895935, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.8512135148048401, + "step": 11430 + }, + { + "ce_loss": 0.2419276386499405, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.39038610458374023, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.17313846945762634, + "step": 11430 + }, + { + "epoch": 3.815877251501001, + "loss": 0.9297, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "grad_norm": 1.5681135654449463, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "learning_rate": 0.0005675227583439101, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.9723762273788452, + "step": 11440 + }, + { + "ce_loss": 0.318612277507782, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.39654994010925293, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.1935034692287445, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.9978158473968506, + "step": 11440 + }, + { + "ce_loss": 0.2571842670440674, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.34870341420173645, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.22138366103172302, + "step": 11440 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.9286, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "grad_norm": 2.1632747650146484, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "learning_rate": 0.0005671302531308378, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 1.3651049137115479, + "step": 11450 + }, + { + "ce_loss": 0.4106031358242035, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.526943027973175, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.19772358238697052, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.8169754147529602, + "step": 11450 + }, + { + "ce_loss": 0.22076590359210968, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.36730560660362244, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.18142905831336975, + "step": 11450 + }, + { + "epoch": 3.8225483655770516, + "loss": 1.0225, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "grad_norm": 1.8946765661239624, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "learning_rate": 0.000566737552858776, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 0.8395046591758728, + "step": 11460 + }, + { + "ce_loss": 0.21020092070102692, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.3612057566642761, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.18916825950145721, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 1.445246696472168, + "step": 11460 + }, + { + "ce_loss": 0.1898326277732849, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.30803728103637695, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.13109755516052246, + "step": 11460 + }, + { + "epoch": 3.825883922615077, + "loss": 1.0358, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "grad_norm": 1.6582616567611694, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "learning_rate": 0.0005663446579860484, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.9347367286682129, + "step": 11470 + }, + { + "ce_loss": 0.24629852175712585, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.42903152108192444, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.19728568196296692, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.8807161450386047, + "step": 11470 + }, + { + "ce_loss": 0.25386542081832886, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.42778605222702026, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.1473657488822937, + "step": 11470 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.9359, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "grad_norm": 1.6030758619308472, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "learning_rate": 0.0005659515689712055, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 1.6527743339538574, + "step": 11480 + }, + { + "ce_loss": 0.25234341621398926, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.44041571021080017, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.19048476219177246, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 1.0194995403289795, + "step": 11480 + }, + { + "ce_loss": 0.279092013835907, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.4226592481136322, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.1561541110277176, + "step": 11480 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.9889, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "grad_norm": 3.101188898086548, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "learning_rate": 0.0005655582862730246, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.9315988421440125, + "step": 11490 + }, + { + "ce_loss": 0.2899245619773865, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.4321276545524597, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.209343820810318, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.9821953177452087, + "step": 11490 + }, + { + "ce_loss": 0.32735350728034973, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.43529486656188965, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.2194371074438095, + "step": 11490 + }, + { + "epoch": 3.835890593729153, + "loss": 0.9208, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "grad_norm": 2.652716875076294, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "learning_rate": 0.0005651648103505088, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 1.133766531944275, + "step": 11500 + }, + { + "ce_loss": 0.3791617453098297, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.485236793756485, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.22717487812042236, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 1.312686562538147, + "step": 11500 + }, + { + "ce_loss": 0.292243629693985, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.4213583171367645, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.23505744338035583, + "step": 11500 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.9949, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "grad_norm": 1.6241575479507446, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "learning_rate": 0.0005647711416628867, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.8845500349998474, + "step": 11510 + }, + { + "ce_loss": 0.21759924292564392, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.45337334275245667, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.21198280155658722, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 1.3442260026931763, + "step": 11510 + }, + { + "ce_loss": 0.2494586855173111, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.4459645748138428, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.196689635515213, + "step": 11510 + }, + { + "epoch": 3.8425617078052037, + "loss": 1.0527, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "grad_norm": 3.0656344890594482, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "learning_rate": 0.0005643772806696121, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.6794697642326355, + "step": 11520 + }, + { + "ce_loss": 0.18232791125774384, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.2960767447948456, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.16795457899570465, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 1.225640058517456, + "step": 11520 + }, + { + "ce_loss": 0.2971508800983429, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.3605060577392578, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.20262135565280914, + "step": 11520 + }, + { + "epoch": 3.845897264843229, + "loss": 0.9276, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "grad_norm": 2.342433214187622, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "learning_rate": 0.0005639832278303635, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 1.0097906589508057, + "step": 11530 + }, + { + "ce_loss": 0.325705349445343, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.38462504744529724, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.24787509441375732, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 0.7476581335067749, + "step": 11530 + }, + { + "ce_loss": 0.21294884383678436, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.28872600197792053, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.19121286273002625, + "step": 11530 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.9075, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "grad_norm": 1.577726125717163, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "learning_rate": 0.0005635889836050424, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.7826956510543823, + "step": 11540 + }, + { + "ce_loss": 0.19527533650398254, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.3526442050933838, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.18786796927452087, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 1.1441820859909058, + "step": 11540 + }, + { + "ce_loss": 0.2938821017742157, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.4380337595939636, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.19052253663539886, + "step": 11540 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.9338, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "grad_norm": 2.0250110626220703, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "learning_rate": 0.0005631945484537748, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.8967652320861816, + "step": 11550 + }, + { + "ce_loss": 0.25885623693466187, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.3998911380767822, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.17840033769607544, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.7759818434715271, + "step": 11550 + }, + { + "ce_loss": 0.24023577570915222, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.3571813106536865, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.17843982577323914, + "step": 11550 + }, + { + "epoch": 3.855903935957305, + "loss": 0.9064, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "grad_norm": 1.7031183242797852, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "learning_rate": 0.0005627999228369085, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.8540741801261902, + "step": 11560 + }, + { + "ce_loss": 0.26487821340560913, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.3655508756637573, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.1813589483499527, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.9351487159729004, + "step": 11560 + }, + { + "ce_loss": 0.22617729008197784, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.3438546061515808, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.2046925276517868, + "step": 11560 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.8818, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "grad_norm": 1.5661900043487549, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "learning_rate": 0.0005624051072150144, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.9107818603515625, + "step": 11570 + }, + { + "ce_loss": 0.213269904255867, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.2721361219882965, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.18350692093372345, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.9907459020614624, + "step": 11570 + }, + { + "ce_loss": 0.3015082776546478, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.3977634012699127, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.23032258450984955, + "step": 11570 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.9655, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "grad_norm": 2.352884292602539, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "learning_rate": 0.0005620101020488846, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.939232349395752, + "step": 11580 + }, + { + "ce_loss": 0.2103375494480133, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.39442554116249084, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.1931764930486679, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.906415581703186, + "step": 11580 + }, + { + "ce_loss": 0.23109756410121918, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.35593166947364807, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.18460983037948608, + "step": 11580 + }, + { + "epoch": 3.865910607071381, + "loss": 0.9738, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "grad_norm": 2.738985776901245, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "learning_rate": 0.0005616149077995327, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 0.8089038133621216, + "step": 11590 + }, + { + "ce_loss": 0.19172555208206177, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.4226304590702057, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.14394626021385193, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 1.1200426816940308, + "step": 11590 + }, + { + "ce_loss": 0.29103899002075195, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.43065327405929565, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.2459547519683838, + "step": 11590 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.976, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "grad_norm": 1.8246506452560425, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "learning_rate": 0.0005612195249281929, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 1.0592341423034668, + "step": 11600 + }, + { + "ce_loss": 0.2808479368686676, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.3962073028087616, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.19259952008724213, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.7964693307876587, + "step": 11600 + }, + { + "ce_loss": 0.21781563758850098, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.38051968812942505, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.1979907602071762, + "step": 11600 + }, + { + "epoch": 3.872581721147432, + "loss": 0.9616, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "grad_norm": 3.134458065032959, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "learning_rate": 0.0005608239538963196, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.7981612086296082, + "step": 11610 + }, + { + "ce_loss": 0.23766736686229706, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.37494006752967834, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.15241451561450958, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.7920822501182556, + "step": 11610 + }, + { + "ce_loss": 0.2178262621164322, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.3271922767162323, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.17308199405670166, + "step": 11610 + }, + { + "epoch": 3.875917278185457, + "loss": 0.9232, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "grad_norm": 1.5603810548782349, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "learning_rate": 0.0005604281951655868, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.7440937161445618, + "step": 11620 + }, + { + "ce_loss": 0.1296824812889099, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.3723609149456024, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.1284295618534088, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.7650660872459412, + "step": 11620 + }, + { + "ce_loss": 0.19801418483257294, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.3212094008922577, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.14547830820083618, + "step": 11620 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.9491, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "grad_norm": 2.8136508464813232, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "learning_rate": 0.0005600322491978873, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 1.1764713525772095, + "step": 11630 + }, + { + "ce_loss": 0.259009450674057, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.3844780921936035, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.18857301771640778, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 1.0874712467193604, + "step": 11630 + }, + { + "ce_loss": 0.3212282061576843, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.4671300947666168, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.24173882603645325, + "step": 11630 + }, + { + "epoch": 3.882588392261508, + "loss": 1.0431, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "grad_norm": 3.056569814682007, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "learning_rate": 0.0005596361164553328, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.9977152347564697, + "step": 11640 + }, + { + "ce_loss": 0.26923465728759766, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.45488986372947693, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.1783684492111206, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.966017484664917, + "step": 11640 + }, + { + "ce_loss": 0.25669997930526733, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.4465809464454651, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.1974465399980545, + "step": 11640 + }, + { + "epoch": 3.885923949299533, + "loss": 0.9427, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "grad_norm": 1.5865579843521118, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "learning_rate": 0.0005592397974002529, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 0.9701722264289856, + "step": 11650 + }, + { + "ce_loss": 0.29138481616973877, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.42188239097595215, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.19481723010540009, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 1.0660479068756104, + "step": 11650 + }, + { + "ce_loss": 0.33463263511657715, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.49488842487335205, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.19474494457244873, + "step": 11650 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.997, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "grad_norm": 11.462584495544434, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "learning_rate": 0.0005588432924951946, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 1.0220303535461426, + "step": 11660 + }, + { + "ce_loss": 0.22120627760887146, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.4513772130012512, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.15893080830574036, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.7385839819908142, + "step": 11660 + }, + { + "ce_loss": 0.2053300440311432, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.329124391078949, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.1607813835144043, + "step": 11660 + }, + { + "epoch": 3.892595063375584, + "loss": 0.998, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "grad_norm": 3.3075084686279297, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "learning_rate": 0.0005584466022029216, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.818089485168457, + "step": 11670 + }, + { + "ce_loss": 0.20594333112239838, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.3592768609523773, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.19639188051223755, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.8467075228691101, + "step": 11670 + }, + { + "ce_loss": 0.20294122397899628, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.40357089042663574, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.19864407181739807, + "step": 11670 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.9248, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "grad_norm": 1.8866527080535889, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "learning_rate": 0.0005580497269864143, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.9279657602310181, + "step": 11680 + }, + { + "ce_loss": 0.20917311310768127, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.45263731479644775, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.1506700962781906, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 1.6638834476470947, + "step": 11680 + }, + { + "ce_loss": 0.35619089007377625, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.43666911125183105, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.1947658658027649, + "step": 11680 + }, + { + "epoch": 3.8992661774516346, + "loss": 1.0302, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "grad_norm": 2.085873603820801, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "learning_rate": 0.0005576526673088687, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.9703258872032166, + "step": 11690 + }, + { + "ce_loss": 0.30788952112197876, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.44872936606407166, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.21353916823863983, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 1.0158580541610718, + "step": 11690 + }, + { + "ce_loss": 0.27169573307037354, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.4001632630825043, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.14454112946987152, + "step": 11690 + }, + { + "epoch": 3.90260173448966, + "loss": 0.9128, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "grad_norm": 2.307708740234375, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "learning_rate": 0.0005572554236336965, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 0.7074758410453796, + "step": 11700 + }, + { + "ce_loss": 0.2118670642375946, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.36380529403686523, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.1316874474287033, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 1.0797171592712402, + "step": 11700 + }, + { + "ce_loss": 0.36392879486083984, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.4837646484375, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.18095727264881134, + "step": 11700 + }, + { + "epoch": 3.9059372915276853, + "loss": 1.0169, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "grad_norm": 1.657469391822815, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "learning_rate": 0.0005568579964245232, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.8145405650138855, + "step": 11710 + }, + { + "ce_loss": 0.2403329610824585, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.40306556224823, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.1709594428539276, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.7344586849212646, + "step": 11710 + }, + { + "ce_loss": 0.16823998093605042, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.32683515548706055, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.14987175166606903, + "step": 11710 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.9718, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "grad_norm": 1.7074451446533203, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "learning_rate": 0.0005564603861451897, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.9449779391288757, + "step": 11720 + }, + { + "ce_loss": 0.18956990540027618, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.4226230978965759, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.20470745861530304, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.7637391090393066, + "step": 11720 + }, + { + "ce_loss": 0.19855497777462006, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.3552302420139313, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.1477682888507843, + "step": 11720 + }, + { + "epoch": 3.912608405603736, + "loss": 1.0152, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "grad_norm": 2.0933945178985596, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "learning_rate": 0.0005560625932597494, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.9675263166427612, + "step": 11730 + }, + { + "ce_loss": 0.24618081748485565, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.4911355972290039, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.18077616393566132, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.7928896546363831, + "step": 11730 + }, + { + "ce_loss": 0.2026551514863968, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.3882897198200226, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.20181968808174133, + "step": 11730 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.9015, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "grad_norm": 2.6087889671325684, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "learning_rate": 0.00055566461823247, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.7968668937683105, + "step": 11740 + }, + { + "ce_loss": 0.17508544027805328, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.4119051694869995, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.1454639434814453, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 1.1098814010620117, + "step": 11740 + }, + { + "ce_loss": 0.2541578412055969, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.4228516221046448, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.15402543544769287, + "step": 11740 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.9195, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "grad_norm": 1.6107227802276611, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "learning_rate": 0.0005552664615278308, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.9172618389129639, + "step": 11750 + }, + { + "ce_loss": 0.20093008875846863, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.39634275436401367, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.17416979372501373, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.8351260423660278, + "step": 11750 + }, + { + "ce_loss": 0.19515912234783173, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.3535310626029968, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.21191735565662384, + "step": 11750 + }, + { + "epoch": 3.922615076717812, + "loss": 0.8606, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "grad_norm": 1.865551471710205, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "learning_rate": 0.0005548681236105239, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.8535208702087402, + "step": 11760 + }, + { + "ce_loss": 0.20602232217788696, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.4059385657310486, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.1823427379131317, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.8497105240821838, + "step": 11760 + }, + { + "ce_loss": 0.19806364178657532, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.3617856204509735, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.16827456653118134, + "step": 11760 + }, + { + "epoch": 3.9259506337558374, + "loss": 1.0327, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "grad_norm": 1.6781734228134155, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "learning_rate": 0.000554469604945452, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 1.0478616952896118, + "step": 11770 + }, + { + "ce_loss": 0.3283158838748932, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.4395040273666382, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.2273147851228714, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 1.000312328338623, + "step": 11770 + }, + { + "ce_loss": 0.257784366607666, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.40631210803985596, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.19344256818294525, + "step": 11770 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.9852, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "grad_norm": 1.7029778957366943, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "learning_rate": 0.0005540709059977295, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.7573671936988831, + "step": 11780 + }, + { + "ce_loss": 0.21558508276939392, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.37834596633911133, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.16318820416927338, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.8037177324295044, + "step": 11780 + }, + { + "ce_loss": 0.220443993806839, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.42402610182762146, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.1590704619884491, + "step": 11780 + }, + { + "epoch": 3.932621747831888, + "loss": 0.9677, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "grad_norm": 1.564285159111023, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "learning_rate": 0.000553672027232681, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 1.0326200723648071, + "step": 11790 + }, + { + "ce_loss": 0.321227490901947, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.512378454208374, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.19875237345695496, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 0.9585893154144287, + "step": 11790 + }, + { + "ce_loss": 0.29675567150115967, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.4784086346626282, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.18320876359939575, + "step": 11790 + }, + { + "epoch": 3.9359573048699135, + "loss": 1.008, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "grad_norm": 1.9860695600509644, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "learning_rate": 0.000553272969115841, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.743493378162384, + "step": 11800 + }, + { + "ce_loss": 0.22666524350643158, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.33357691764831543, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.18286947906017303, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.6927028894424438, + "step": 11800 + }, + { + "ce_loss": 0.14644154906272888, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.32516467571258545, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.14002791047096252, + "step": 11800 + }, + { + "epoch": 3.939292861907939, + "loss": 0.9237, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "grad_norm": 2.8374104499816895, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "learning_rate": 0.0005528737321129532, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.9271951913833618, + "step": 11810 + }, + { + "ce_loss": 0.3107430040836334, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.37300023436546326, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.1981804221868515, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.9263529181480408, + "step": 11810 + }, + { + "ce_loss": 0.2825232446193695, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.39083096385002136, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.2027083933353424, + "step": 11810 + }, + { + "epoch": 3.942628418945964, + "loss": 0.9221, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "grad_norm": 1.7531052827835083, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "learning_rate": 0.0005524743166899701, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 1.1111127138137817, + "step": 11820 + }, + { + "ce_loss": 0.30061981081962585, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.43286579847335815, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.21395601332187653, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 0.940593957901001, + "step": 11820 + }, + { + "ce_loss": 0.2132023274898529, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.3509421944618225, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.12664273381233215, + "step": 11820 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.9548, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "grad_norm": 2.730525493621826, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "learning_rate": 0.0005520747233130525, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.9735243320465088, + "step": 11830 + }, + { + "ce_loss": 0.26277390122413635, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.391275554895401, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.16927450895309448, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 1.2078938484191895, + "step": 11830 + }, + { + "ce_loss": 0.370376318693161, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.4815659523010254, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.23204077780246735, + "step": 11830 + }, + { + "epoch": 3.949299533022015, + "loss": 1.0518, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "grad_norm": 2.2668051719665527, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "learning_rate": 0.0005516749524485688, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 0.8577396273612976, + "step": 11840 + }, + { + "ce_loss": 0.2821693420410156, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.36145728826522827, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.21394376456737518, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 1.1756106615066528, + "step": 11840 + }, + { + "ce_loss": 0.2654615640640259, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.36142125725746155, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.2146204560995102, + "step": 11840 + }, + { + "epoch": 3.95263509006004, + "loss": 0.9651, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "grad_norm": 1.6232107877731323, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "learning_rate": 0.0005512750045630947, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.8906152248382568, + "step": 11850 + }, + { + "ce_loss": 0.2890780568122864, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.35990801453590393, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.1792559176683426, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.9378436803817749, + "step": 11850 + }, + { + "ce_loss": 0.28084325790405273, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.4056680500507355, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.2064526230096817, + "step": 11850 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.9023, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "grad_norm": 2.2000057697296143, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "learning_rate": 0.0005508748801234127, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.924540102481842, + "step": 11860 + }, + { + "ce_loss": 0.26077377796173096, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.442877858877182, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.16746099293231964, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 1.2667691707611084, + "step": 11860 + }, + { + "ce_loss": 0.26282796263694763, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.46519577503204346, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.20361657440662384, + "step": 11860 + }, + { + "epoch": 3.959306204136091, + "loss": 1.0097, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "grad_norm": 1.7751314640045166, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "learning_rate": 0.0005504745795965104, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 0.9637972116470337, + "step": 11870 + }, + { + "ce_loss": 0.20609210431575775, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.413042277097702, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.17276988923549652, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 1.049441933631897, + "step": 11870 + }, + { + "ce_loss": 0.22047023475170135, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.4326194226741791, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.15497992932796478, + "step": 11870 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.9621, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "grad_norm": 1.6827456951141357, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "learning_rate": 0.0005500741034495822, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.7790173888206482, + "step": 11880 + }, + { + "ce_loss": 0.20525261759757996, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.3322910666465759, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.19715571403503418, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 1.076646089553833, + "step": 11880 + }, + { + "ce_loss": 0.2301023304462433, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.3682442903518677, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.18718735873699188, + "step": 11880 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.8858, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "grad_norm": 1.9867877960205078, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "learning_rate": 0.0005496734521500265, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 1.0159093141555786, + "step": 11890 + }, + { + "ce_loss": 0.253842830657959, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.3500358462333679, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.19169864058494568, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.8605555295944214, + "step": 11890 + }, + { + "ce_loss": 0.2701185941696167, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.4148210883140564, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.1754133701324463, + "step": 11890 + }, + { + "epoch": 3.969312875250167, + "loss": 0.8858, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "grad_norm": 2.906344413757324, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "learning_rate": 0.0005492726261654467, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 1.1624562740325928, + "step": 11900 + }, + { + "ce_loss": 0.2951255738735199, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.47946152091026306, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.23917268216609955, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 1.189023494720459, + "step": 11900 + }, + { + "ce_loss": 0.29075753688812256, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.3917032778263092, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.22798548638820648, + "step": 11900 + }, + { + "epoch": 3.9726484322881923, + "loss": 1.0152, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "grad_norm": 2.9853665828704834, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "learning_rate": 0.0005488716259636498, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.8837149739265442, + "step": 11910 + }, + { + "ce_loss": 0.2542972266674042, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.4023490846157074, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.20137716829776764, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.9087706804275513, + "step": 11910 + }, + { + "ce_loss": 0.29229024052619934, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.36741021275520325, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.2003345787525177, + "step": 11910 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.9513, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "grad_norm": 1.9088329076766968, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "learning_rate": 0.0005484704520126461, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.8984208703041077, + "step": 11920 + }, + { + "ce_loss": 0.24777761101722717, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.38926613330841064, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.2100924551486969, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 1.131884217262268, + "step": 11920 + }, + { + "ce_loss": 0.32054466009140015, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.5047661662101746, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.17037981748580933, + "step": 11920 + }, + { + "epoch": 3.979319546364243, + "loss": 0.9376, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "grad_norm": 1.9283043146133423, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "learning_rate": 0.0005480691047806488, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.8340834975242615, + "step": 11930 + }, + { + "ce_loss": 0.2035832554101944, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.40630069375038147, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.1837412267923355, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.8505741357803345, + "step": 11930 + }, + { + "ce_loss": 0.2108788788318634, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.38408035039901733, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.18300923705101013, + "step": 11930 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.9159, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "grad_norm": 12.302298545837402, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "learning_rate": 0.0005476675847360734, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 1.0876966714859009, + "step": 11940 + }, + { + "ce_loss": 0.25745177268981934, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.45677369832992554, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.19750654697418213, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.9414987564086914, + "step": 11940 + }, + { + "ce_loss": 0.2690792381763458, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.3595500588417053, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.18679878115653992, + "step": 11940 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.9927, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "grad_norm": 1.953128695487976, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "learning_rate": 0.0005472658923475368, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.6685543060302734, + "step": 11950 + }, + { + "ce_loss": 0.11985540390014648, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.36389437317848206, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.1333955079317093, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 1.2167911529541016, + "step": 11950 + }, + { + "ce_loss": 0.3260270655155182, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.4484832286834717, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.16716627776622772, + "step": 11950 + }, + { + "epoch": 3.989326217478319, + "loss": 1.0491, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "grad_norm": 1.5239057540893555, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "learning_rate": 0.0005468640280838575, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 0.9055935740470886, + "step": 11960 + }, + { + "ce_loss": 0.25763407349586487, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.3997820317745209, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.17403864860534668, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 0.8026612997055054, + "step": 11960 + }, + { + "ce_loss": 0.22291156649589539, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.40883633494377136, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.17074339091777802, + "step": 11960 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.921, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "grad_norm": 2.39037823677063, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "learning_rate": 0.0005464619924140541, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.5891600251197815, + "step": 11970 + }, + { + "ce_loss": 0.13960763812065125, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.333700954914093, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.11562500894069672, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.9206572771072388, + "step": 11970 + }, + { + "ce_loss": 0.24288669228553772, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.4398137629032135, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.17862555384635925, + "step": 11970 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.9467, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "grad_norm": 2.8624608516693115, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "learning_rate": 0.0005460597858073456, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 1.3518834114074707, + "step": 11980 + }, + { + "ce_loss": 0.2634531259536743, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.48528337478637695, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.2191203236579895, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.9772735834121704, + "step": 11980 + }, + { + "ce_loss": 0.24835364520549774, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.4489499032497406, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.2041686326265335, + "step": 11980 + }, + { + "epoch": 3.999332888592395, + "loss": 0.9538, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "grad_norm": 1.8687514066696167, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "learning_rate": 0.0005456574087331504, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.6367960572242737, + "step": 11990 + }, + { + "ce_loss": 0.16839922964572906, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.3280557692050934, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.14013376832008362, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.8866681456565857, + "step": 11990 + }, + { + "ce_loss": 0.27331751585006714, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.42752671241760254, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.1855868250131607, + "step": 11990 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.8725, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "grad_norm": 1.8681031465530396, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "learning_rate": 0.0005452548616610858, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.7981876134872437, + "step": 12000 + }, + { + "ce_loss": 0.17341767251491547, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.28598684072494507, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.17627839744091034, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 1.0411994457244873, + "step": 12000 + }, + { + "ce_loss": 0.2961692214012146, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.43006405234336853, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.22421853244304657, + "step": 12000 + }, + { + "epoch": 4.006004002668446, + "loss": 0.897, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "grad_norm": 1.94181489944458, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "learning_rate": 0.0005448521450609677, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 1.054775595664978, + "step": 12010 + }, + { + "ce_loss": 0.2572634816169739, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.5030579566955566, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.21427693963050842, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 0.7503544092178345, + "step": 12010 + }, + { + "ce_loss": 0.2099463790655136, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.33009880781173706, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.2099847048521042, + "step": 12010 + }, + { + "epoch": 4.009339559706471, + "loss": 0.8854, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "grad_norm": 2.251950979232788, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "learning_rate": 0.0005444492594028093, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.91937255859375, + "step": 12020 + }, + { + "ce_loss": 0.21336789429187775, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.41891738772392273, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.1578998565673828, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.9270004034042358, + "step": 12020 + }, + { + "ce_loss": 0.19843682646751404, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.44454413652420044, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.18976789712905884, + "step": 12020 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.9284, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "grad_norm": 1.778459072113037, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "learning_rate": 0.000544046205156822, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.6953597068786621, + "step": 12030 + }, + { + "ce_loss": 0.16227765381336212, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.30457478761672974, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.1753004640340805, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.9509637951850891, + "step": 12030 + }, + { + "ce_loss": 0.23665675520896912, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.43513789772987366, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.1287996470928192, + "step": 12030 + }, + { + "epoch": 4.016010673782522, + "loss": 0.8804, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "grad_norm": 1.5397865772247314, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "learning_rate": 0.0005436429827934133, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.9101223349571228, + "step": 12040 + }, + { + "ce_loss": 0.24761877954006195, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.33264732360839844, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.19348663091659546, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.8087942600250244, + "step": 12040 + }, + { + "ce_loss": 0.18762274086475372, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.39426758885383606, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.16436782479286194, + "step": 12040 + }, + { + "epoch": 4.019346230820547, + "loss": 0.8119, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "grad_norm": 2.189448118209839, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "learning_rate": 0.000543239592783187, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.6664413809776306, + "step": 12050 + }, + { + "ce_loss": 0.17020738124847412, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.3622167706489563, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.13371214270591736, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.9993815422058105, + "step": 12050 + }, + { + "ce_loss": 0.2715194821357727, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.4615587294101715, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.21049873530864716, + "step": 12050 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.8495, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "grad_norm": 1.3513885736465454, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "learning_rate": 0.0005428360355969426, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.9639096856117249, + "step": 12060 + }, + { + "ce_loss": 0.19991366565227509, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.45069777965545654, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.1532137244939804, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.7424845695495605, + "step": 12060 + }, + { + "ce_loss": 0.1836809664964676, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.30216172337532043, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.18060757219791412, + "step": 12060 + }, + { + "epoch": 4.026017344896598, + "loss": 0.8583, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "grad_norm": 1.7382936477661133, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "learning_rate": 0.0005424323117056751, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.8760979771614075, + "step": 12070 + }, + { + "ce_loss": 0.2157914787530899, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.35931938886642456, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.14864644408226013, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.6753813028335571, + "step": 12070 + }, + { + "ce_loss": 0.14919540286064148, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.35045838356018066, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.17523548007011414, + "step": 12070 + }, + { + "epoch": 4.029352901934623, + "loss": 0.8266, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "grad_norm": 1.6985952854156494, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "learning_rate": 0.0005420284215805732, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 1.2062604427337646, + "step": 12080 + }, + { + "ce_loss": 0.23293708264827728, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.46571627259254456, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.2237669676542282, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 1.3210084438323975, + "step": 12080 + }, + { + "ce_loss": 0.19478954374790192, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.348651260137558, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.16467390954494476, + "step": 12080 + }, + { + "epoch": 4.032688458972649, + "loss": 0.8626, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "grad_norm": 2.3197574615478516, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "learning_rate": 0.0005416243656930207, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.8391457200050354, + "step": 12090 + }, + { + "ce_loss": 0.22044244408607483, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.3444957733154297, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.15553082525730133, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.779373824596405, + "step": 12090 + }, + { + "ce_loss": 0.20906268060207367, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.38116130232810974, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.1889278143644333, + "step": 12090 + }, + { + "epoch": 4.036024016010674, + "loss": 0.8207, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "grad_norm": 1.705793857574463, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "learning_rate": 0.0005412201445145939, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.8325830101966858, + "step": 12100 + }, + { + "ce_loss": 0.26227590441703796, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.3504793047904968, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.21957749128341675, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.6191930770874023, + "step": 12100 + }, + { + "ce_loss": 0.1428786665201187, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.2747659683227539, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.15719301998615265, + "step": 12100 + }, + { + "epoch": 4.039359573048699, + "loss": 0.8851, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "grad_norm": 1.9953546524047852, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "learning_rate": 0.0005408157585170625, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.7786689400672913, + "step": 12110 + }, + { + "ce_loss": 0.18109384179115295, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.3678598999977112, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.1337146908044815, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.8011458516120911, + "step": 12110 + }, + { + "ce_loss": 0.18645349144935608, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.39759284257888794, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.16702933609485626, + "step": 12110 + }, + { + "epoch": 4.042695130086725, + "loss": 0.922, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "grad_norm": 2.8055052757263184, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "learning_rate": 0.0005404112081723885, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.7414431571960449, + "step": 12120 + }, + { + "ce_loss": 0.17341484129428864, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.3635369539260864, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.15303896367549896, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.7546785473823547, + "step": 12120 + }, + { + "ce_loss": 0.17873521149158478, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.34869545698165894, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.15745292603969574, + "step": 12120 + }, + { + "epoch": 4.04603068712475, + "loss": 0.8738, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "grad_norm": 1.4640624523162842, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "learning_rate": 0.0005400064939527257, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.9559147953987122, + "step": 12130 + }, + { + "ce_loss": 0.20529085397720337, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.38767898082733154, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.1801074594259262, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.7419565916061401, + "step": 12130 + }, + { + "ce_loss": 0.2174055576324463, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.35398024320602417, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.1339172124862671, + "step": 12130 + }, + { + "epoch": 4.049366244162775, + "loss": 0.9346, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "grad_norm": 1.9079539775848389, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "learning_rate": 0.0005396016163304192, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.9857970476150513, + "step": 12140 + }, + { + "ce_loss": 0.16553930938243866, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.3548365533351898, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.1572820097208023, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.8096906542778015, + "step": 12140 + }, + { + "ce_loss": 0.17616169154644012, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.3309439420700073, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.17873306572437286, + "step": 12140 + }, + { + "epoch": 4.052701801200801, + "loss": 0.8059, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "grad_norm": 1.834365725517273, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "learning_rate": 0.0005391965757780047, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.8741999268531799, + "step": 12150 + }, + { + "ce_loss": 0.2503318786621094, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.3878973722457886, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.19882416725158691, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.8098387122154236, + "step": 12150 + }, + { + "ce_loss": 0.186272531747818, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.3593500256538391, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.20150873064994812, + "step": 12150 + }, + { + "epoch": 4.056037358238826, + "loss": 0.9423, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "grad_norm": 1.893646001815796, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "learning_rate": 0.0005387913727682081, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.737841784954071, + "step": 12160 + }, + { + "ce_loss": 0.18643051385879517, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.3098827004432678, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.14226196706295013, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.9402210712432861, + "step": 12160 + }, + { + "ce_loss": 0.21556143462657928, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.3903096318244934, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.17290903627872467, + "step": 12160 + }, + { + "epoch": 4.059372915276851, + "loss": 0.8178, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "grad_norm": 3.435178279876709, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "learning_rate": 0.0005383860077739448, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.7845951914787292, + "step": 12170 + }, + { + "ce_loss": 0.22703777253627777, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.40096449851989746, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.15633171796798706, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.9148541688919067, + "step": 12170 + }, + { + "ce_loss": 0.30216914415359497, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.3802482485771179, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.17897044122219086, + "step": 12170 + }, + { + "epoch": 4.062708472314877, + "loss": 0.892, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "grad_norm": 1.3513412475585938, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "learning_rate": 0.0005379804812683194, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 1.0374103784561157, + "step": 12180 + }, + { + "ce_loss": 0.2488425225019455, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.3943198621273041, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.14718931913375854, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 0.6714553833007812, + "step": 12180 + }, + { + "ce_loss": 0.19426223635673523, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.3287561237812042, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.14829982817173004, + "step": 12180 + }, + { + "epoch": 4.066044029352902, + "loss": 0.8747, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "grad_norm": 2.586073637008667, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "learning_rate": 0.0005375747937246253, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.800316333770752, + "step": 12190 + }, + { + "ce_loss": 0.2151237428188324, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.34133580327033997, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.1352153867483139, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.6638113856315613, + "step": 12190 + }, + { + "ce_loss": 0.1923898607492447, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.2985258102416992, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.12334320694208145, + "step": 12190 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.8444, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "grad_norm": 2.256270408630371, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "learning_rate": 0.0005371689456163431, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.7758722305297852, + "step": 12200 + }, + { + "ce_loss": 0.20380429923534393, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.38605189323425293, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.15181048214435577, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 1.1506251096725464, + "step": 12200 + }, + { + "ce_loss": 0.27955031394958496, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.46950602531433105, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.20281149446964264, + "step": 12200 + }, + { + "epoch": 4.072715143428953, + "loss": 0.8862, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "grad_norm": 2.828490734100342, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "learning_rate": 0.0005367629374171415, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.8178892135620117, + "step": 12210 + }, + { + "ce_loss": 0.18405741453170776, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.37577715516090393, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.13698050379753113, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.7907006740570068, + "step": 12210 + }, + { + "ce_loss": 0.2112656682729721, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.3416105806827545, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.18553809821605682, + "step": 12210 + }, + { + "epoch": 4.076050700466978, + "loss": 0.8332, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "grad_norm": 1.8846862316131592, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "learning_rate": 0.0005363567696008755, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.8879470825195312, + "step": 12220 + }, + { + "ce_loss": 0.20231954753398895, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.34891757369041443, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.1723923534154892, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.8345476984977722, + "step": 12220 + }, + { + "ce_loss": 0.22824759781360626, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.39881062507629395, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.2072669118642807, + "step": 12220 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.9121, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "grad_norm": 1.5614279508590698, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "learning_rate": 0.0005359504426415869, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.9222154021263123, + "step": 12230 + }, + { + "ce_loss": 0.21302418410778046, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.3472989499568939, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.19184233248233795, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.854812741279602, + "step": 12230 + }, + { + "ce_loss": 0.20118644833564758, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.375763475894928, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.17603860795497894, + "step": 12230 + }, + { + "epoch": 4.082721814543029, + "loss": 0.8894, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "grad_norm": 2.552529811859131, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "learning_rate": 0.0005355439570135028, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 0.8024995923042297, + "step": 12240 + }, + { + "ce_loss": 0.1890585571527481, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.43341967463493347, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.1784624308347702, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 1.153965950012207, + "step": 12240 + }, + { + "ce_loss": 0.15036635100841522, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.29220816493034363, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.12937654554843903, + "step": 12240 + }, + { + "epoch": 4.086057371581054, + "loss": 0.9087, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "grad_norm": 1.2219160795211792, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "learning_rate": 0.0005351373131910357, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.8469090461730957, + "step": 12250 + }, + { + "ce_loss": 0.21229684352874756, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.4633750915527344, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.17109663784503937, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.8332298398017883, + "step": 12250 + }, + { + "ce_loss": 0.231580451130867, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.4204753637313843, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.18100927770137787, + "step": 12250 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.8893, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "grad_norm": 3.462394952774048, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "learning_rate": 0.0005347305116487827, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 1.1916685104370117, + "step": 12260 + }, + { + "ce_loss": 0.2497703731060028, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.43108367919921875, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.20356877148151398, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.8169243931770325, + "step": 12260 + }, + { + "ce_loss": 0.24040541052818298, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.36889344453811646, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.15058249235153198, + "step": 12260 + }, + { + "epoch": 4.092728485657105, + "loss": 0.8878, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "grad_norm": 1.7535141706466675, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "learning_rate": 0.0005343235528615252, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.782029926776886, + "step": 12270 + }, + { + "ce_loss": 0.12700021266937256, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.2926868200302124, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.13709864020347595, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.62381911277771, + "step": 12270 + }, + { + "ce_loss": 0.15468086302280426, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.32623642683029175, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.14236095547676086, + "step": 12270 + }, + { + "epoch": 4.09606404269513, + "loss": 0.9293, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "grad_norm": 3.5007686614990234, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "learning_rate": 0.0005339164373042275, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.8049991130828857, + "step": 12280 + }, + { + "ce_loss": 0.19103293120861053, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.4100547432899475, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.15682145953178406, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.7409756183624268, + "step": 12280 + }, + { + "ce_loss": 0.188636913895607, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.40608739852905273, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.1458330750465393, + "step": 12280 + }, + { + "epoch": 4.099399599733156, + "loss": 1.0107, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "grad_norm": 1.8453807830810547, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "learning_rate": 0.0005335091654520374, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.9331576824188232, + "step": 12290 + }, + { + "ce_loss": 0.2545464038848877, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.45423731207847595, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.16400742530822754, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.8729333877563477, + "step": 12290 + }, + { + "ce_loss": 0.21526475250720978, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.4151415526866913, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.16812297701835632, + "step": 12290 + }, + { + "epoch": 4.102735156771181, + "loss": 0.9338, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "grad_norm": 1.6489121913909912, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "learning_rate": 0.0005331017377802853, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.7752780318260193, + "step": 12300 + }, + { + "ce_loss": 0.1960146278142929, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.3631875514984131, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.1767382174730301, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.5530449151992798, + "step": 12300 + }, + { + "ce_loss": 0.17343278229236603, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.28233903646469116, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.0964079201221466, + "step": 12300 + }, + { + "epoch": 4.106070713809206, + "loss": 0.8486, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "grad_norm": 2.788712978363037, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "learning_rate": 0.0005326941547644827, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 0.6198724508285522, + "step": 12310 + }, + { + "ce_loss": 0.15598838031291962, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.3028586208820343, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.12344849854707718, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 1.2339098453521729, + "step": 12310 + }, + { + "ce_loss": 0.2487998604774475, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.42270636558532715, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.17186760902404785, + "step": 12310 + }, + { + "epoch": 4.109406270847232, + "loss": 0.8935, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "grad_norm": 2.3489551544189453, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "learning_rate": 0.0005322864168803231, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 1.010440468788147, + "step": 12320 + }, + { + "ce_loss": 0.24523364007472992, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.4519507586956024, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.17541134357452393, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 0.7359771728515625, + "step": 12320 + }, + { + "ce_loss": 0.17784751951694489, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.3079465627670288, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.18632066249847412, + "step": 12320 + }, + { + "epoch": 4.112741827885257, + "loss": 0.8383, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "grad_norm": 2.5580828189849854, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "learning_rate": 0.0005318785246036802, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.8238269090652466, + "step": 12330 + }, + { + "ce_loss": 0.2343672662973404, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.3459825813770294, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.19455063343048096, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.738259494304657, + "step": 12330 + }, + { + "ce_loss": 0.20209765434265137, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.40758582949638367, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.1284421682357788, + "step": 12330 + }, + { + "epoch": 4.116077384923282, + "loss": 0.9199, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "grad_norm": 2.166008710861206, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "learning_rate": 0.0005314704784106086, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 1.1044200658798218, + "step": 12340 + }, + { + "ce_loss": 0.3675804138183594, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.4157879948616028, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.20491206645965576, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 0.7513501644134521, + "step": 12340 + }, + { + "ce_loss": 0.17775169014930725, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.3683262765407562, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.14589950442314148, + "step": 12340 + }, + { + "epoch": 4.119412941961308, + "loss": 0.9352, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "grad_norm": 2.094301223754883, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "learning_rate": 0.0005310622787773417, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.8135566115379333, + "step": 12350 + }, + { + "ce_loss": 0.23291337490081787, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.3937142789363861, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.13678307831287384, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.6764379143714905, + "step": 12350 + }, + { + "ce_loss": 0.20164714753627777, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.3326803743839264, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.14201340079307556, + "step": 12350 + }, + { + "epoch": 4.122748498999333, + "loss": 0.8407, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "grad_norm": 1.809917688369751, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "learning_rate": 0.0005306539261802928, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.784867525100708, + "step": 12360 + }, + { + "ce_loss": 0.17784570157527924, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.37807798385620117, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.17634142935276031, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.7875006794929504, + "step": 12360 + }, + { + "ce_loss": 0.20042015612125397, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.3755456507205963, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.1658916175365448, + "step": 12360 + }, + { + "epoch": 4.126084056037358, + "loss": 0.8896, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "grad_norm": 2.7777342796325684, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "learning_rate": 0.0005302454210960529, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.6223516464233398, + "step": 12370 + }, + { + "ce_loss": 0.10979112982749939, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.34536290168762207, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.1221790686249733, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.908021867275238, + "step": 12370 + }, + { + "ce_loss": 0.20778703689575195, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.44324469566345215, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.19053210318088531, + "step": 12370 + }, + { + "epoch": 4.129419613075384, + "loss": 0.8902, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "grad_norm": 1.8988518714904785, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "learning_rate": 0.0005298367640013918, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.7390773892402649, + "step": 12380 + }, + { + "ce_loss": 0.20272931456565857, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.34763336181640625, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.15068559348583221, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.8524680733680725, + "step": 12380 + }, + { + "ce_loss": 0.27532801032066345, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.3684280514717102, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.19492267072200775, + "step": 12380 + }, + { + "epoch": 4.132755170113409, + "loss": 0.8947, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "grad_norm": 2.2255940437316895, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "learning_rate": 0.0005294279553732558, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.8695392608642578, + "step": 12390 + }, + { + "ce_loss": 0.23825789988040924, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.4132382869720459, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.16632671654224396, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.8631477355957031, + "step": 12390 + }, + { + "ce_loss": 0.20770201086997986, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.37137627601623535, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.2013377696275711, + "step": 12390 + }, + { + "epoch": 4.136090727151434, + "loss": 0.8487, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "grad_norm": 1.5887508392333984, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "learning_rate": 0.0005290189956887691, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.8215475082397461, + "step": 12400 + }, + { + "ce_loss": 0.20373262465000153, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.38555386662483215, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.17290176451206207, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.6696367263793945, + "step": 12400 + }, + { + "ce_loss": 0.17253293097019196, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.3173353672027588, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.14702731370925903, + "step": 12400 + }, + { + "epoch": 4.13942628418946, + "loss": 0.8461, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "grad_norm": 1.7948837280273438, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "learning_rate": 0.0005286098854252313, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.807908833026886, + "step": 12410 + }, + { + "ce_loss": 0.16164608299732208, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.3655582368373871, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.13646908104419708, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.911085307598114, + "step": 12410 + }, + { + "ce_loss": 0.25217583775520325, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.3669683635234833, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.1766831874847412, + "step": 12410 + }, + { + "epoch": 4.142761841227485, + "loss": 0.9583, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "grad_norm": 1.9843021631240845, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "learning_rate": 0.0005282006250601183, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.7271966934204102, + "step": 12420 + }, + { + "ce_loss": 0.2506415843963623, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.3042491674423218, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.1721765249967575, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.9262433052062988, + "step": 12420 + }, + { + "ce_loss": 0.19725489616394043, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.3291471004486084, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.15821436047554016, + "step": 12420 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.8732, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "grad_norm": 1.9727813005447388, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "learning_rate": 0.0005277912150710808, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.7784503698348999, + "step": 12430 + }, + { + "ce_loss": 0.1683514267206192, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.33196163177490234, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.16521677374839783, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.6911976933479309, + "step": 12430 + }, + { + "ce_loss": 0.15645936131477356, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.3178960382938385, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.1477438360452652, + "step": 12430 + }, + { + "epoch": 4.149432955303536, + "loss": 0.8732, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "grad_norm": 5.099236488342285, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "learning_rate": 0.0005273816559359444, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.764206051826477, + "step": 12440 + }, + { + "ce_loss": 0.14106637239456177, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.35556140542030334, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.13623017072677612, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.6508702635765076, + "step": 12440 + }, + { + "ce_loss": 0.13022911548614502, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.3620184659957886, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.11632784456014633, + "step": 12440 + }, + { + "epoch": 4.152768512341561, + "loss": 0.8843, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "grad_norm": 2.6267311573028564, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "learning_rate": 0.0005269719481327087, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 0.6806822419166565, + "step": 12450 + }, + { + "ce_loss": 0.15208680927753448, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.3680131137371063, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.16049380600452423, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 1.2068417072296143, + "step": 12450 + }, + { + "ce_loss": 0.1920844465494156, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.3374309539794922, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.18087457120418549, + "step": 12450 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.8568, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "grad_norm": 1.804216742515564, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "learning_rate": 0.0005265620921395469, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.8538085222244263, + "step": 12460 + }, + { + "ce_loss": 0.25904232263565063, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.35094770789146423, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.1721329391002655, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 1.1678739786148071, + "step": 12460 + }, + { + "ce_loss": 0.2640455365180969, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.39140695333480835, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.1936824917793274, + "step": 12460 + }, + { + "epoch": 4.159439626417612, + "loss": 0.9146, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "grad_norm": 1.7934942245483398, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "learning_rate": 0.0005261520884348048, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 1.1549052000045776, + "step": 12470 + }, + { + "ce_loss": 0.3144696354866028, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.4195968508720398, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.18539300560951233, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 1.1561297178268433, + "step": 12470 + }, + { + "ce_loss": 0.30278608202934265, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.47233226895332336, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.21444253623485565, + "step": 12470 + }, + { + "epoch": 4.162775183455637, + "loss": 1.0016, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "grad_norm": 2.2133514881134033, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "learning_rate": 0.0005257419374970012, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.9485584497451782, + "step": 12480 + }, + { + "ce_loss": 0.14209671318531036, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.43666237592697144, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.16533339023590088, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.7782878875732422, + "step": 12480 + }, + { + "ce_loss": 0.18979869782924652, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.40814101696014404, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.14509843289852142, + "step": 12480 + }, + { + "epoch": 4.166110740493663, + "loss": 0.8745, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "grad_norm": 1.8272050619125366, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "learning_rate": 0.0005253316398048258, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.8993416428565979, + "step": 12490 + }, + { + "ce_loss": 0.2716492712497711, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.38575854897499084, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.19661776721477509, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.8288870453834534, + "step": 12490 + }, + { + "ce_loss": 0.2048046737909317, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.3267305791378021, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.15473157167434692, + "step": 12490 + }, + { + "epoch": 4.169446297531688, + "loss": 0.8546, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "grad_norm": 1.6469388008117676, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "learning_rate": 0.0005249211958371406, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.9824590086936951, + "step": 12500 + }, + { + "ce_loss": 0.2935064435005188, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.39997178316116333, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.21537864208221436, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.7035849690437317, + "step": 12500 + }, + { + "ce_loss": 0.20198889076709747, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.3350256383419037, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.16631345450878143, + "step": 12500 + }, + { + "epoch": 4.172781854569713, + "loss": 0.9153, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "grad_norm": 3.0709216594696045, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "learning_rate": 0.000524510606072978, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.7076886296272278, + "step": 12510 + }, + { + "ce_loss": 0.17329534888267517, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.34127768874168396, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.1456620842218399, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.6534823775291443, + "step": 12510 + }, + { + "ce_loss": 0.18447883427143097, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.29903972148895264, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.16942507028579712, + "step": 12510 + }, + { + "epoch": 4.176117411607739, + "loss": 0.8183, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "grad_norm": 1.55356764793396, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "learning_rate": 0.00052409987099154, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.7567281723022461, + "step": 12520 + }, + { + "ce_loss": 0.17792902886867523, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.3390805721282959, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.16375339031219482, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.6664754748344421, + "step": 12520 + }, + { + "ce_loss": 0.20032376050949097, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.32088297605514526, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.120712511241436, + "step": 12520 + }, + { + "epoch": 4.179452968645764, + "loss": 0.7824, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "grad_norm": 1.796610713005066, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "learning_rate": 0.0005236889910721989, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.7799438238143921, + "step": 12530 + }, + { + "ce_loss": 0.23633547127246857, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.35754677653312683, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.1504986733198166, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.5622330904006958, + "step": 12530 + }, + { + "ce_loss": 0.132956400513649, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.27733558416366577, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.15136227011680603, + "step": 12530 + }, + { + "epoch": 4.182788525683789, + "loss": 0.7844, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "grad_norm": 1.6813994646072388, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "learning_rate": 0.0005232779667944959, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 1.1557304859161377, + "step": 12540 + }, + { + "ce_loss": 0.19993092119693756, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.4479179084300995, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.15367430448532104, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 0.7927672266960144, + "step": 12540 + }, + { + "ce_loss": 0.26575636863708496, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.3921426832675934, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.1343362182378769, + "step": 12540 + }, + { + "epoch": 4.186124082721815, + "loss": 0.868, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "grad_norm": 1.5843020677566528, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "learning_rate": 0.0005228667986381402, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.9517365097999573, + "step": 12550 + }, + { + "ce_loss": 0.22790701687335968, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.42120158672332764, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.1550002545118332, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.7985783219337463, + "step": 12550 + }, + { + "ce_loss": 0.23687949776649475, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.30934327840805054, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.19739030301570892, + "step": 12550 + }, + { + "epoch": 4.18945963975984, + "loss": 0.9034, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "grad_norm": 1.9663341045379639, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "learning_rate": 0.0005224554870830095, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.7035840153694153, + "step": 12560 + }, + { + "ce_loss": 0.21058326959609985, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.2658486068248749, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.1693160980939865, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.8260355591773987, + "step": 12560 + }, + { + "ce_loss": 0.22241933643817902, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.32658758759498596, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.1754622608423233, + "step": 12560 + }, + { + "epoch": 4.192795196797865, + "loss": 0.8248, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "grad_norm": 1.6988965272903442, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "learning_rate": 0.0005220440326091486, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.8381247520446777, + "step": 12570 + }, + { + "ce_loss": 0.24428690969944, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.3550545573234558, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.17451095581054688, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.5025757551193237, + "step": 12570 + }, + { + "ce_loss": 0.12798961997032166, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.20385515689849854, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.12277933955192566, + "step": 12570 + }, + { + "epoch": 4.196130753835891, + "loss": 0.8421, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "grad_norm": 2.9220128059387207, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "learning_rate": 0.0005216324356967692, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.8571416139602661, + "step": 12580 + }, + { + "ce_loss": 0.27399173378944397, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.3700946271419525, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.16993477940559387, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.9421014785766602, + "step": 12580 + }, + { + "ce_loss": 0.2747233510017395, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.3414028286933899, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.1843489110469818, + "step": 12580 + }, + { + "epoch": 4.199466310873916, + "loss": 0.9192, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "grad_norm": 1.5139302015304565, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "learning_rate": 0.0005212206968262492, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 0.7501705884933472, + "step": 12590 + }, + { + "ce_loss": 0.21901053190231323, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.3573254942893982, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.17358480393886566, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 1.0535688400268555, + "step": 12590 + }, + { + "ce_loss": 0.23637911677360535, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.3890411853790283, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.1661222130060196, + "step": 12590 + }, + { + "epoch": 4.202801867911941, + "loss": 0.9307, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "grad_norm": 4.023229122161865, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "learning_rate": 0.0005208088164781322, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.8946199417114258, + "step": 12600 + }, + { + "ce_loss": 0.20067694783210754, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.369188517332077, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.14849264919757843, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.7389215230941772, + "step": 12600 + }, + { + "ce_loss": 0.1676078885793686, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.3055753707885742, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.17706617712974548, + "step": 12600 + }, + { + "epoch": 4.206137424949967, + "loss": 0.8706, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "grad_norm": 2.451463222503662, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "learning_rate": 0.0005203967951331266, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.9454283714294434, + "step": 12610 + }, + { + "ce_loss": 0.28624972701072693, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.4206025004386902, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.1610059291124344, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.8937854766845703, + "step": 12610 + }, + { + "ce_loss": 0.19485153257846832, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.4219454228878021, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.1623564213514328, + "step": 12610 + }, + { + "epoch": 4.209472981987992, + "loss": 0.9549, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "grad_norm": 1.8533005714416504, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "learning_rate": 0.0005199846332721059, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 1.1628186702728271, + "step": 12620 + }, + { + "ce_loss": 0.2804311513900757, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.44220656156539917, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.21410658955574036, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 0.6214933395385742, + "step": 12620 + }, + { + "ce_loss": 0.13951043784618378, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.3288695514202118, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.11865205317735672, + "step": 12620 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.9514, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "grad_norm": 3.439215660095215, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "learning_rate": 0.0005195723313761074, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.7460795640945435, + "step": 12630 + }, + { + "ce_loss": 0.193592369556427, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.3062877655029297, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.14694787561893463, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.9722935557365417, + "step": 12630 + }, + { + "ce_loss": 0.21647977828979492, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.4791179597377777, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.17317292094230652, + "step": 12630 + }, + { + "epoch": 4.216144096064043, + "loss": 0.8806, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "grad_norm": 1.7811447381973267, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "learning_rate": 0.0005191598899263315, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.9708473086357117, + "step": 12640 + }, + { + "ce_loss": 0.20301295816898346, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.38349807262420654, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.23510898649692535, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.7471814751625061, + "step": 12640 + }, + { + "ce_loss": 0.1882941722869873, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.3489193022251129, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.18988509476184845, + "step": 12640 + }, + { + "epoch": 4.219479653102068, + "loss": 0.814, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "grad_norm": 2.196380138397217, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "learning_rate": 0.0005187473094041421, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.7829256653785706, + "step": 12650 + }, + { + "ce_loss": 0.19593583047389984, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.35681459307670593, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.1733783781528473, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.8933637738227844, + "step": 12650 + }, + { + "ce_loss": 0.2182813584804535, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.3945019841194153, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.14737632870674133, + "step": 12650 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.7941, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "grad_norm": 2.7803497314453125, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "learning_rate": 0.0005183345902910646, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.663300096988678, + "step": 12660 + }, + { + "ce_loss": 0.15582312643527985, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.29122138023376465, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.16431498527526855, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.893598198890686, + "step": 12660 + }, + { + "ce_loss": 0.24435116350650787, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.31676265597343445, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.18816092610359192, + "step": 12660 + }, + { + "epoch": 4.226150767178119, + "loss": 0.8823, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "grad_norm": 1.656362533569336, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "learning_rate": 0.0005179217330687872, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.9187933206558228, + "step": 12670 + }, + { + "ce_loss": 0.23749017715454102, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.3897465765476227, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.15931007266044617, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.8433822393417358, + "step": 12670 + }, + { + "ce_loss": 0.25055891275405884, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.3643941879272461, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.1786240190267563, + "step": 12670 + }, + { + "epoch": 4.229486324216144, + "loss": 0.8254, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "grad_norm": 1.8392776250839233, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "learning_rate": 0.0005175087382191583, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.9356280565261841, + "step": 12680 + }, + { + "ce_loss": 0.24201759696006775, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.3536796569824219, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.17648980021476746, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.8160145878791809, + "step": 12680 + }, + { + "ce_loss": 0.23435355722904205, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.28745996952056885, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.18511168658733368, + "step": 12680 + }, + { + "epoch": 4.23282188125417, + "loss": 0.8672, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "grad_norm": 1.6637465953826904, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "learning_rate": 0.0005170956062241875, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.8128212690353394, + "step": 12690 + }, + { + "ce_loss": 0.2414637804031372, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.362344354391098, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.17531849443912506, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.8842551112174988, + "step": 12690 + }, + { + "ce_loss": 0.269711434841156, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.369120717048645, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.1902657300233841, + "step": 12690 + }, + { + "epoch": 4.236157438292195, + "loss": 0.8807, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "grad_norm": 1.8229620456695557, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "learning_rate": 0.0005166823375660441, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 1.0383368730545044, + "step": 12700 + }, + { + "ce_loss": 0.22843684256076813, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.37846410274505615, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.21161183714866638, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 0.7493757605552673, + "step": 12700 + }, + { + "ce_loss": 0.17815834283828735, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.3474876582622528, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.18813316524028778, + "step": 12700 + }, + { + "epoch": 4.23949299533022, + "loss": 0.9174, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "grad_norm": 1.520404577255249, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "learning_rate": 0.0005162689327270573, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 1.1266505718231201, + "step": 12710 + }, + { + "ce_loss": 0.24157746136188507, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.4278128147125244, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.18293094635009766, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 0.9948121905326843, + "step": 12710 + }, + { + "ce_loss": 0.2826690971851349, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.4682421088218689, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.19122135639190674, + "step": 12710 + }, + { + "epoch": 4.242828552368246, + "loss": 0.8706, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "grad_norm": 2.3407678604125977, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "learning_rate": 0.0005158553921897149, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 1.2130777835845947, + "step": 12720 + }, + { + "ce_loss": 0.16182638704776764, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.3381054103374481, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.154271200299263, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 0.8921987414360046, + "step": 12720 + }, + { + "ce_loss": 0.21352069079875946, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.42966151237487793, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.20217864215373993, + "step": 12720 + }, + { + "epoch": 4.246164109406271, + "loss": 0.8674, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "grad_norm": 3.0453741550445557, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "learning_rate": 0.0005154417164366633, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 1.0986595153808594, + "step": 12730 + }, + { + "ce_loss": 0.22912782430648804, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.3212457597255707, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.15528671443462372, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 0.7199212908744812, + "step": 12730 + }, + { + "ce_loss": 0.2046150267124176, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.33154305815696716, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.12476657330989838, + "step": 12730 + }, + { + "epoch": 4.249499666444296, + "loss": 0.8704, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "grad_norm": 1.2896811962127686, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "learning_rate": 0.0005150279059507065, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.7782720327377319, + "step": 12740 + }, + { + "ce_loss": 0.1771492063999176, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.30043888092041016, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.174044668674469, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.6854279041290283, + "step": 12740 + }, + { + "ce_loss": 0.1830480545759201, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.31467360258102417, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.1465923935174942, + "step": 12740 + }, + { + "epoch": 4.252835223482322, + "loss": 0.8925, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "grad_norm": 1.632073163986206, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "learning_rate": 0.0005146139612148061, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.8414084315299988, + "step": 12750 + }, + { + "ce_loss": 0.2268008589744568, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.34458792209625244, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.15668857097625732, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.6938906908035278, + "step": 12750 + }, + { + "ce_loss": 0.17403708398342133, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.3637546896934509, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.15595611929893494, + "step": 12750 + }, + { + "epoch": 4.256170780520347, + "loss": 0.8466, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "grad_norm": 1.6854667663574219, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "learning_rate": 0.0005141998827120799, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.804252564907074, + "step": 12760 + }, + { + "ce_loss": 0.25367993116378784, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.3248867690563202, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.18379093706607819, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.9785430431365967, + "step": 12760 + }, + { + "ce_loss": 0.29850438237190247, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.3835088610649109, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.1554730236530304, + "step": 12760 + }, + { + "epoch": 4.259506337558372, + "loss": 0.9539, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "grad_norm": 2.00483775138855, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "learning_rate": 0.0005137856709258021, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.759904682636261, + "step": 12770 + }, + { + "ce_loss": 0.22166599333286285, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.3422871232032776, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.194756418466568, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.7452161908149719, + "step": 12770 + }, + { + "ce_loss": 0.1742466241121292, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.2873826026916504, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.1804172545671463, + "step": 12770 + }, + { + "epoch": 4.262841894596398, + "loss": 0.9132, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "grad_norm": 1.7677546739578247, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "learning_rate": 0.0005133713263394025, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 1.2018852233886719, + "step": 12780 + }, + { + "ce_loss": 0.2658690810203552, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.4588566720485687, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.20907336473464966, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 0.7002390027046204, + "step": 12780 + }, + { + "ce_loss": 0.19882053136825562, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.3090733587741852, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.1564292162656784, + "step": 12780 + }, + { + "epoch": 4.266177451634423, + "loss": 0.9217, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "grad_norm": 2.054368019104004, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "learning_rate": 0.000512956849436466, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.8584941625595093, + "step": 12790 + }, + { + "ce_loss": 0.22313432395458221, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.40078526735305786, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.19047851860523224, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.96540367603302, + "step": 12790 + }, + { + "ce_loss": 0.22203348577022552, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.4080057144165039, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.22590765357017517, + "step": 12790 + }, + { + "epoch": 4.269513008672448, + "loss": 0.8961, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "grad_norm": 1.797431468963623, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "learning_rate": 0.0005125422407007313, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.6185550689697266, + "step": 12800 + }, + { + "ce_loss": 0.17735739052295685, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.24161382019519806, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.14409980177879333, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.7940037250518799, + "step": 12800 + }, + { + "ce_loss": 0.2755930721759796, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.35304659605026245, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.1652856320142746, + "step": 12800 + }, + { + "epoch": 4.272848565710474, + "loss": 0.8813, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "grad_norm": 2.873664140701294, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "learning_rate": 0.0005121275006160918, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 1.2027432918548584, + "step": 12810 + }, + { + "ce_loss": 0.24637238681316376, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.5167804956436157, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.19930368661880493, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 0.8857278227806091, + "step": 12810 + }, + { + "ce_loss": 0.19141751527786255, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.424816370010376, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.16012920439243317, + "step": 12810 + }, + { + "epoch": 4.276184122748499, + "loss": 0.8761, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "grad_norm": 2.463259220123291, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "learning_rate": 0.0005117126296665935, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.8840368986129761, + "step": 12820 + }, + { + "ce_loss": 0.23989543318748474, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.35038143396377563, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.1598270982503891, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.6414638161659241, + "step": 12820 + }, + { + "ce_loss": 0.1843615025281906, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.2808043956756592, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.13928772509098053, + "step": 12820 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.8466, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "grad_norm": 1.7067866325378418, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "learning_rate": 0.0005112976283364358, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.7968407869338989, + "step": 12830 + }, + { + "ce_loss": 0.20776928961277008, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.2952131927013397, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.21513982117176056, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 1.7987573146820068, + "step": 12830 + }, + { + "ce_loss": 0.2212960422039032, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.34988850355148315, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.17741668224334717, + "step": 12830 + }, + { + "epoch": 4.28285523682455, + "loss": 0.8365, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "grad_norm": 2.650939464569092, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "learning_rate": 0.0005108824971099697, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.819025456905365, + "step": 12840 + }, + { + "ce_loss": 0.17640937864780426, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.2923857569694519, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.17533421516418457, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.8999745845794678, + "step": 12840 + }, + { + "ce_loss": 0.21573689579963684, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.43884068727493286, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.20454314351081848, + "step": 12840 + }, + { + "epoch": 4.286190793862575, + "loss": 0.8693, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "grad_norm": 2.5925934314727783, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "learning_rate": 0.0005104672364716979, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 1.0080406665802002, + "step": 12850 + }, + { + "ce_loss": 0.2583248019218445, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.39177629351615906, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.2107367366552353, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 1.320812463760376, + "step": 12850 + }, + { + "ce_loss": 0.23779240250587463, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.35576707124710083, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.16666685044765472, + "step": 12850 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.9322, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "grad_norm": 1.8717230558395386, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "learning_rate": 0.0005100518469062745, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.907106339931488, + "step": 12860 + }, + { + "ce_loss": 0.26588577032089233, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.4072806239128113, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.19155624508857727, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 1.1353204250335693, + "step": 12860 + }, + { + "ce_loss": 0.308605432510376, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.45205217599868774, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.22317735850811005, + "step": 12860 + }, + { + "epoch": 4.292861907938626, + "loss": 0.895, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "grad_norm": 1.8743352890014648, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "learning_rate": 0.0005096363288985035, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.9951395988464355, + "step": 12870 + }, + { + "ce_loss": 0.26687008142471313, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.44366684556007385, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.16680413484573364, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.9308007955551147, + "step": 12870 + }, + { + "ce_loss": 0.21052157878875732, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.3582399785518646, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.18249179422855377, + "step": 12870 + }, + { + "epoch": 4.296197464976651, + "loss": 0.8788, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "grad_norm": 1.6709355115890503, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "learning_rate": 0.0005092206829333394, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 0.7861965298652649, + "step": 12880 + }, + { + "ce_loss": 0.17194722592830658, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.45903676748275757, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.15497665107250214, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 1.0698442459106445, + "step": 12880 + }, + { + "ce_loss": 0.1924717128276825, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.3156096339225769, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.13832615315914154, + "step": 12880 + }, + { + "epoch": 4.299533022014677, + "loss": 0.9493, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "grad_norm": 1.8523396253585815, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "learning_rate": 0.0005088049094958858, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 1.0296885967254639, + "step": 12890 + }, + { + "ce_loss": 0.22927792370319366, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.3084159195423126, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.1630437821149826, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 0.8998898863792419, + "step": 12890 + }, + { + "ce_loss": 0.23149771988391876, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.3336685597896576, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.21322858333587646, + "step": 12890 + }, + { + "epoch": 4.302868579052702, + "loss": 0.882, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "grad_norm": 1.5939126014709473, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "learning_rate": 0.0005083890090713949, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 1.1143803596496582, + "step": 12900 + }, + { + "ce_loss": 0.2878939211368561, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.4585745632648468, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.2042665034532547, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 0.6540172696113586, + "step": 12900 + }, + { + "ce_loss": 0.19547341763973236, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.30709272623062134, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.1513615995645523, + "step": 12900 + }, + { + "epoch": 4.306204136090727, + "loss": 0.863, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "grad_norm": 3.067009687423706, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "learning_rate": 0.0005079729821452671, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.7821261286735535, + "step": 12910 + }, + { + "ce_loss": 0.22335682809352875, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.3630492091178894, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.19557523727416992, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.8023974299430847, + "step": 12910 + }, + { + "ce_loss": 0.19819621741771698, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.36896613240242004, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.17763593792915344, + "step": 12910 + }, + { + "epoch": 4.309539693128753, + "loss": 0.8432, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "grad_norm": 1.4734580516815186, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "learning_rate": 0.000507556829203051, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.9618943929672241, + "step": 12920 + }, + { + "ce_loss": 0.2590271532535553, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.3850080966949463, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.16762995719909668, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.6719835996627808, + "step": 12920 + }, + { + "ce_loss": 0.1529945433139801, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.3388926088809967, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.1480969339609146, + "step": 12920 + }, + { + "epoch": 4.312875250166778, + "loss": 0.9353, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "grad_norm": 1.7050225734710693, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "learning_rate": 0.0005071405507304414, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 1.1230779886245728, + "step": 12930 + }, + { + "ce_loss": 0.19990414381027222, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.3488672971725464, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.1832718402147293, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 0.8064915537834167, + "step": 12930 + }, + { + "ce_loss": 0.23185153305530548, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.330584317445755, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.22238650918006897, + "step": 12930 + }, + { + "epoch": 4.316210807204803, + "loss": 0.8709, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "grad_norm": 3.6646835803985596, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "learning_rate": 0.0005067241472132805, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.8757513761520386, + "step": 12940 + }, + { + "ce_loss": 0.2661345601081848, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.4074326157569885, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.2019917368888855, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.8526966571807861, + "step": 12940 + }, + { + "ce_loss": 0.22637148201465607, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.37172484397888184, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.18659645318984985, + "step": 12940 + }, + { + "epoch": 4.319546364242829, + "loss": 0.9024, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "grad_norm": 2.06301212310791, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "learning_rate": 0.0005063076191375556, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.8412178754806519, + "step": 12950 + }, + { + "ce_loss": 0.20359109342098236, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.30335208773612976, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.16786359250545502, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.6845080852508545, + "step": 12950 + }, + { + "ce_loss": 0.1543601006269455, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.3259199559688568, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.15403856337070465, + "step": 12950 + }, + { + "epoch": 4.322881921280854, + "loss": 0.831, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "grad_norm": 1.7822954654693604, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "learning_rate": 0.0005058909669894002, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 1.1513652801513672, + "step": 12960 + }, + { + "ce_loss": 0.2275664210319519, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.3893887400627136, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.16971373558044434, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 0.8988643288612366, + "step": 12960 + }, + { + "ce_loss": 0.26751142740249634, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.4451296329498291, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.13988979160785675, + "step": 12960 + }, + { + "epoch": 4.326217478318879, + "loss": 0.9181, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "grad_norm": 1.441675066947937, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "learning_rate": 0.0005054741912550918, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.9492163062095642, + "step": 12970 + }, + { + "ce_loss": 0.18496477603912354, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.5061658620834351, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.181679368019104, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.9847216606140137, + "step": 12970 + }, + { + "ce_loss": 0.15506592392921448, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.3762945830821991, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.19500184059143066, + "step": 12970 + }, + { + "epoch": 4.329553035356905, + "loss": 0.8201, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "grad_norm": 2.2929203510284424, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "learning_rate": 0.0005050572924210528, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 1.3470618724822998, + "step": 12980 + }, + { + "ce_loss": 0.2621327340602875, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.4442044794559479, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.19769565761089325, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 0.931287407875061, + "step": 12980 + }, + { + "ce_loss": 0.23502033948898315, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.39018332958221436, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.17888310551643372, + "step": 12980 + }, + { + "epoch": 4.33288859239493, + "loss": 0.9278, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "grad_norm": 2.8423891067504883, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "learning_rate": 0.0005046402709738489, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 1.4319112300872803, + "step": 12990 + }, + { + "ce_loss": 0.20456096529960632, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.3574753999710083, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.16366833448410034, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 0.6514447331428528, + "step": 12990 + }, + { + "ce_loss": 0.16469235718250275, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.2948162257671356, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.14291289448738098, + "step": 12990 + }, + { + "epoch": 4.336224149432955, + "loss": 0.9135, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "grad_norm": 1.7230727672576904, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "learning_rate": 0.0005042231274001891, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.6460437774658203, + "step": 13000 + }, + { + "ce_loss": 0.15081174671649933, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.3070511519908905, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.1417672336101532, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.9081906080245972, + "step": 13000 + }, + { + "ce_loss": 0.23736201226711273, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.3211541175842285, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.18557564914226532, + "step": 13000 + }, + { + "epoch": 4.339559706470981, + "loss": 0.7863, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "grad_norm": 2.5761780738830566, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "learning_rate": 0.0005038058621869246, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.7194321155548096, + "step": 13010 + }, + { + "ce_loss": 0.1710882931947708, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.35647034645080566, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.19168949127197266, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.8294156193733215, + "step": 13010 + }, + { + "ce_loss": 0.23769475519657135, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.3103196620941162, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.1724769026041031, + "step": 13010 + }, + { + "epoch": 4.342895263509006, + "loss": 0.9211, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "grad_norm": 1.8360869884490967, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "learning_rate": 0.000503388475821049, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.8466196060180664, + "step": 13020 + }, + { + "ce_loss": 0.2331269234418869, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.367826372385025, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.19748416543006897, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.7141017317771912, + "step": 13020 + }, + { + "ce_loss": 0.1875707358121872, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.33911678194999695, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.15469998121261597, + "step": 13020 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.8646, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "grad_norm": 2.126154661178589, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "learning_rate": 0.0005029709687896972, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.8564475178718567, + "step": 13030 + }, + { + "ce_loss": 0.2630447447299957, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.3674970269203186, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.18027503788471222, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.8201711177825928, + "step": 13030 + }, + { + "ce_loss": 0.2565067410469055, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.41150057315826416, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.15197144448757172, + "step": 13030 + }, + { + "epoch": 4.349566377585057, + "loss": 0.92, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "grad_norm": 1.716796636581421, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "learning_rate": 0.0005025533415801446, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 0.7857934832572937, + "step": 13040 + }, + { + "ce_loss": 0.20469725131988525, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.41409963369369507, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.1668863147497177, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 1.0408332347869873, + "step": 13040 + }, + { + "ce_loss": 0.245316743850708, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.34827446937561035, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.1869659125804901, + "step": 13040 + }, + { + "epoch": 4.352901934623082, + "loss": 0.8889, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "grad_norm": 2.0432233810424805, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "learning_rate": 0.000502135594679807, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 1.0115782022476196, + "step": 13050 + }, + { + "ce_loss": 0.2615415155887604, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.42382967472076416, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.19795654714107513, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 0.7003949284553528, + "step": 13050 + }, + { + "ce_loss": 0.14454393088817596, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.34949007630348206, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.17572638392448425, + "step": 13050 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.8138, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "grad_norm": 1.5323294401168823, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "learning_rate": 0.0005017177285762404, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.7875097990036011, + "step": 13060 + }, + { + "ce_loss": 0.19186387956142426, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.33928823471069336, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.14275150001049042, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.9309220910072327, + "step": 13060 + }, + { + "ce_loss": 0.28330251574516296, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.39531224966049194, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.1772787719964981, + "step": 13060 + }, + { + "epoch": 4.359573048699133, + "loss": 0.8571, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "grad_norm": 2.4798803329467773, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "learning_rate": 0.0005012997437571392, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.854665994644165, + "step": 13070 + }, + { + "ce_loss": 0.27411356568336487, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.3180094361305237, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.19514678418636322, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.8343544602394104, + "step": 13070 + }, + { + "ce_loss": 0.23687444627285004, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.36419615149497986, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.16759879887104034, + "step": 13070 + }, + { + "epoch": 4.362908605737158, + "loss": 0.9007, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "grad_norm": 2.274061679840088, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "learning_rate": 0.0005008816407103368, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.7848496437072754, + "step": 13080 + }, + { + "ce_loss": 0.23218511044979095, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.3505823016166687, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.20187018811702728, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.7971723079681396, + "step": 13080 + }, + { + "ce_loss": 0.1814272552728653, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.38159671425819397, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.11631757766008377, + "step": 13080 + }, + { + "epoch": 4.366244162775184, + "loss": 0.8833, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "grad_norm": 3.208780288696289, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "learning_rate": 0.0005004634199238042, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.9118923544883728, + "step": 13090 + }, + { + "ce_loss": 0.24683193862438202, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.39711278676986694, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.1471678465604782, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.9972525835037231, + "step": 13090 + }, + { + "ce_loss": 0.2438964694738388, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.5098133683204651, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.19950233399868011, + "step": 13090 + }, + { + "epoch": 4.369579719813209, + "loss": 0.8771, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "grad_norm": 1.8489611148834229, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "learning_rate": 0.0005000450818856503, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.9278559684753418, + "step": 13100 + }, + { + "ce_loss": 0.2684767544269562, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.41395726799964905, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.24524687230587006, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.7405310869216919, + "step": 13100 + }, + { + "ce_loss": 0.18698789179325104, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.33907848596572876, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.164516419172287, + "step": 13100 + }, + { + "epoch": 4.372915276851234, + "loss": 0.9152, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "grad_norm": 2.1648755073547363, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "learning_rate": 0.0004996266270841207, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 0.8187690377235413, + "step": 13110 + }, + { + "ce_loss": 0.22833628952503204, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.4345163404941559, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.1556825041770935, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 1.1789733171463013, + "step": 13110 + }, + { + "ce_loss": 0.217407688498497, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.4212278127670288, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.1647656112909317, + "step": 13110 + }, + { + "epoch": 4.37625083388926, + "loss": 0.9027, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "grad_norm": 3.0077097415924072, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "learning_rate": 0.0004992080560075969, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 0.6421822309494019, + "step": 13120 + }, + { + "ce_loss": 0.16113825142383575, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.29631081223487854, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.1408051699399948, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 1.1299493312835693, + "step": 13120 + }, + { + "ce_loss": 0.1643940508365631, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.34408673644065857, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.17624755203723907, + "step": 13120 + }, + { + "epoch": 4.379586390927285, + "loss": 0.8513, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "grad_norm": 1.3924531936645508, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "learning_rate": 0.0004987893691445965, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.9080301523208618, + "step": 13130 + }, + { + "ce_loss": 0.2076728641986847, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.4430268704891205, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.147035151720047, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.7841320633888245, + "step": 13130 + }, + { + "ce_loss": 0.2148856371641159, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.3555300831794739, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.21340398490428925, + "step": 13130 + }, + { + "epoch": 4.38292194796531, + "loss": 0.8792, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "grad_norm": 1.5912320613861084, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "learning_rate": 0.0004983705669837721, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.8423344492912292, + "step": 13140 + }, + { + "ce_loss": 0.20540697872638702, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.33919230103492737, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.22945822775363922, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.7879174947738647, + "step": 13140 + }, + { + "ce_loss": 0.26239603757858276, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.3321240544319153, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.15325109660625458, + "step": 13140 + }, + { + "epoch": 4.386257505003336, + "loss": 0.8441, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "grad_norm": 2.230607509613037, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "learning_rate": 0.0004979516500139109, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.8524758815765381, + "step": 13150 + }, + { + "ce_loss": 0.26263123750686646, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.3410416543483734, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.2094476819038391, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.9680402278900146, + "step": 13150 + }, + { + "ce_loss": 0.251897931098938, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.42440447211265564, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.18267853558063507, + "step": 13150 + }, + { + "epoch": 4.389593062041361, + "loss": 0.921, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "grad_norm": 3.3034005165100098, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "learning_rate": 0.0004975326187239342, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 1.1359477043151855, + "step": 13160 + }, + { + "ce_loss": 0.15224169194698334, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.28388071060180664, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.17234744131565094, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 0.8779703378677368, + "step": 13160 + }, + { + "ce_loss": 0.2350139617919922, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.37165409326553345, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.21345072984695435, + "step": 13160 + }, + { + "epoch": 4.392928619079386, + "loss": 0.9219, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "grad_norm": 1.6124509572982788, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "learning_rate": 0.0004971134736028966, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.7697380185127258, + "step": 13170 + }, + { + "ce_loss": 0.2196720391511917, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.3732682764530182, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.17663975059986115, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.8623855113983154, + "step": 13170 + }, + { + "ce_loss": 0.23376327753067017, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.38788342475891113, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.18725626170635223, + "step": 13170 + }, + { + "epoch": 4.396264176117412, + "loss": 0.8611, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "grad_norm": 2.8501627445220947, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "learning_rate": 0.0004966942151399853, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.5740664005279541, + "step": 13180 + }, + { + "ce_loss": 0.15579481422901154, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.28095322847366333, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.1369730681180954, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.8180100321769714, + "step": 13180 + }, + { + "ce_loss": 0.23740574717521667, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.32309386134147644, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.18247680366039276, + "step": 13180 + }, + { + "epoch": 4.399599733155437, + "loss": 0.8637, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "grad_norm": 1.901659607887268, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "learning_rate": 0.0004962748438245202, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.8824871778488159, + "step": 13190 + }, + { + "ce_loss": 0.24955017864704132, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.3519171178340912, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.18414409458637238, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.7215918302536011, + "step": 13190 + }, + { + "ce_loss": 0.20866245031356812, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.3040982782840729, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.15952186286449432, + "step": 13190 + }, + { + "epoch": 4.402935290193462, + "loss": 0.8484, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "grad_norm": 1.2749011516571045, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "learning_rate": 0.0004958553601459528, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.7972582578659058, + "step": 13200 + }, + { + "ce_loss": 0.24628478288650513, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.2889348864555359, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.19017939269542694, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.8323944211006165, + "step": 13200 + }, + { + "ce_loss": 0.23087507486343384, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.39095279574394226, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.1603068858385086, + "step": 13200 + }, + { + "epoch": 4.406270847231488, + "loss": 0.8376, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "grad_norm": 1.4190902709960938, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "learning_rate": 0.0004954357645938657, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.9994043707847595, + "step": 13210 + }, + { + "ce_loss": 0.30216896533966064, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.33214259147644043, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.1510033756494522, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.9063019752502441, + "step": 13210 + }, + { + "ce_loss": 0.24156875908374786, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.2896386981010437, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.15842963755130768, + "step": 13210 + }, + { + "epoch": 4.409606404269513, + "loss": 0.8183, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "grad_norm": 2.57871675491333, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "learning_rate": 0.0004950160576579717, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.8624581098556519, + "step": 13220 + }, + { + "ce_loss": 0.2337450236082077, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.3577960729598999, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.16363263130187988, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.7249259352684021, + "step": 13220 + }, + { + "ce_loss": 0.21705181896686554, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.351639986038208, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.1266530156135559, + "step": 13220 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.8898, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "grad_norm": 2.350499391555786, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "learning_rate": 0.0004945962398281146, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.9655719995498657, + "step": 13230 + }, + { + "ce_loss": 0.3011987805366516, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.34798118472099304, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.18107043206691742, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 1.0913926362991333, + "step": 13230 + }, + { + "ce_loss": 0.33407920598983765, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.39807650446891785, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.22840847074985504, + "step": 13230 + }, + { + "epoch": 4.416277518345564, + "loss": 0.9181, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "grad_norm": 1.8426028490066528, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "learning_rate": 0.0004941763115942666, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 0.8157694339752197, + "step": 13240 + }, + { + "ce_loss": 0.23468460142612457, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.40097329020500183, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.13971978425979614, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 1.0739431381225586, + "step": 13240 + }, + { + "ce_loss": 0.19124160706996918, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.33609095215797424, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.16506049036979675, + "step": 13240 + }, + { + "epoch": 4.419613075383589, + "loss": 0.9488, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "grad_norm": 2.1189777851104736, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "learning_rate": 0.0004937562734465292, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.8339735269546509, + "step": 13250 + }, + { + "ce_loss": 0.23639149963855743, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.45191851258277893, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.14533786475658417, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.8060436248779297, + "step": 13250 + }, + { + "ce_loss": 0.24148309230804443, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.4032564163208008, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.12805211544036865, + "step": 13250 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.8565, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "grad_norm": 1.9739803075790405, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "learning_rate": 0.000493336125875132, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.9563655853271484, + "step": 13260 + }, + { + "ce_loss": 0.33908629417419434, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.41125985980033875, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.20553144812583923, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.8680077791213989, + "step": 13260 + }, + { + "ce_loss": 0.2155313342809677, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.3745126724243164, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.1538851410150528, + "step": 13260 + }, + { + "epoch": 4.42628418945964, + "loss": 0.899, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "grad_norm": 3.459533214569092, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "learning_rate": 0.0004929158693704325, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.8786895871162415, + "step": 13270 + }, + { + "ce_loss": 0.20969292521476746, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.3508089482784271, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.19444763660430908, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.8677912354469299, + "step": 13270 + }, + { + "ce_loss": 0.21154287457466125, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.29949501156806946, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.1698286235332489, + "step": 13270 + }, + { + "epoch": 4.429619746497665, + "loss": 0.9023, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "grad_norm": 2.0081629753112793, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "learning_rate": 0.0004924955044229154, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.9574787616729736, + "step": 13280 + }, + { + "ce_loss": 0.21764861047267914, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.4211297929286957, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.15459951758384705, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.9137391448020935, + "step": 13280 + }, + { + "ce_loss": 0.2617412805557251, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.4320088326931, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.17992384731769562, + "step": 13280 + }, + { + "epoch": 4.432955303535691, + "loss": 0.8723, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "grad_norm": 2.0066637992858887, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "learning_rate": 0.0004920750315231916, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.8396000862121582, + "step": 13290 + }, + { + "ce_loss": 0.1335640698671341, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.3238210380077362, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.165268212556839, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.8897786140441895, + "step": 13290 + }, + { + "ce_loss": 0.19786790013313293, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.37308627367019653, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.13037879765033722, + "step": 13290 + }, + { + "epoch": 4.436290860573716, + "loss": 0.8879, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "grad_norm": 1.530429482460022, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "learning_rate": 0.0004916544511619984, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.7941538095474243, + "step": 13300 + }, + { + "ce_loss": 0.20175831019878387, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.34787747263908386, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.17431721091270447, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.7491660714149475, + "step": 13300 + }, + { + "ce_loss": 0.22332707047462463, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.30431991815567017, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.1279830038547516, + "step": 13300 + }, + { + "epoch": 4.439626417611741, + "loss": 0.7982, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "grad_norm": 1.6158078908920288, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "learning_rate": 0.0004912337638301983, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.9183559417724609, + "step": 13310 + }, + { + "ce_loss": 0.2765417993068695, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.4219082295894623, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.18277554214000702, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.8864961266517639, + "step": 13310 + }, + { + "ce_loss": 0.2849910855293274, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.37573570013046265, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.16806437075138092, + "step": 13310 + }, + { + "epoch": 4.442961974649767, + "loss": 0.935, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "grad_norm": 1.5370193719863892, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "learning_rate": 0.0004908129700187784, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.9197893738746643, + "step": 13320 + }, + { + "ce_loss": 0.1799246072769165, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.3772495687007904, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.13473819196224213, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.7015503644943237, + "step": 13320 + }, + { + "ce_loss": 0.1640758365392685, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.34314149618148804, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.15765583515167236, + "step": 13320 + }, + { + "epoch": 4.446297531687792, + "loss": 0.7786, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "grad_norm": 1.600947618484497, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "learning_rate": 0.0004903920702188509, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 0.981595516204834, + "step": 13330 + }, + { + "ce_loss": 0.19610193371772766, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.3589596152305603, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.17293380200862885, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 1.329329013824463, + "step": 13330 + }, + { + "ce_loss": 0.24279475212097168, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.4466169774532318, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.23536382615566254, + "step": 13330 + }, + { + "epoch": 4.449633088725817, + "loss": 0.9705, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "grad_norm": 5.19002628326416, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "learning_rate": 0.0004899710649216507, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.6011701822280884, + "step": 13340 + }, + { + "ce_loss": 0.1558227241039276, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.2664649486541748, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.1448230892419815, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.7817053198814392, + "step": 13340 + }, + { + "ce_loss": 0.19946074485778809, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.39532071352005005, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.1866506189107895, + "step": 13340 + }, + { + "epoch": 4.452968645763843, + "loss": 0.841, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "grad_norm": 1.8909868001937866, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "learning_rate": 0.0004895499546185366, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.7894856929779053, + "step": 13350 + }, + { + "ce_loss": 0.21032026410102844, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.3686240315437317, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.16302137076854706, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.7908044457435608, + "step": 13350 + }, + { + "ce_loss": 0.24341550469398499, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.35720616579055786, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.13885928690433502, + "step": 13350 + }, + { + "epoch": 4.456304202801868, + "loss": 0.8926, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "grad_norm": 4.662938594818115, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "learning_rate": 0.0004891287398009894, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 1.2475157976150513, + "step": 13360 + }, + { + "ce_loss": 0.25721490383148193, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.40166175365448, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.17274345457553864, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 0.7158029675483704, + "step": 13360 + }, + { + "ce_loss": 0.15737080574035645, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.33680933713912964, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.1072361096739769, + "step": 13360 + }, + { + "epoch": 4.459639759839893, + "loss": 0.903, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "grad_norm": 1.4409809112548828, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "learning_rate": 0.0004887074209606122, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.8202242851257324, + "step": 13370 + }, + { + "ce_loss": 0.13981536030769348, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.3915843665599823, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.1265016347169876, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.6500847339630127, + "step": 13370 + }, + { + "ce_loss": 0.1637197732925415, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.3304721713066101, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.15574127435684204, + "step": 13370 + }, + { + "epoch": 4.462975316877919, + "loss": 0.8532, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "grad_norm": 1.293358564376831, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "learning_rate": 0.0004882859985891294, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 0.6703695058822632, + "step": 13380 + }, + { + "ce_loss": 0.21158580482006073, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.2534683346748352, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.14631977677345276, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 1.1632661819458008, + "step": 13380 + }, + { + "ce_loss": 0.17849712073802948, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.33899661898612976, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.16240814328193665, + "step": 13380 + }, + { + "epoch": 4.466310873915944, + "loss": 0.9454, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "grad_norm": 1.8045860528945923, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "learning_rate": 0.00048786447317838625, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 0.6276642084121704, + "step": 13390 + }, + { + "ce_loss": 0.1849830448627472, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.2701457142829895, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.17188803851604462, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 1.1554486751556396, + "step": 13390 + }, + { + "ce_loss": 0.26016736030578613, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.35298141837120056, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.20714622735977173, + "step": 13390 + }, + { + "epoch": 4.469646430953969, + "loss": 0.9343, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "grad_norm": 2.388153553009033, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "learning_rate": 0.00048744284522034845, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.5145278573036194, + "step": 13400 + }, + { + "ce_loss": 0.1189117282629013, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.23001186549663544, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.11830934882164001, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.8453248143196106, + "step": 13400 + }, + { + "ce_loss": 0.21656201779842377, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.36059290170669556, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.17768530547618866, + "step": 13400 + }, + { + "epoch": 4.472981987991995, + "loss": 0.8786, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "grad_norm": 2.739274501800537, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "learning_rate": 0.0004870211152071009, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.847709059715271, + "step": 13410 + }, + { + "ce_loss": 0.29653552174568176, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.394599586725235, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.15640832483768463, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.8556306958198547, + "step": 13410 + }, + { + "ce_loss": 0.21378852427005768, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.3763246536254883, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.1716834157705307, + "step": 13410 + }, + { + "epoch": 4.47631754503002, + "loss": 0.8237, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "grad_norm": 2.9815971851348877, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "learning_rate": 0.0004865992836308481, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.7696135640144348, + "step": 13420 + }, + { + "ce_loss": 0.26664626598358154, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.33652380108833313, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.1662791669368744, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.6433257460594177, + "step": 13420 + }, + { + "ce_loss": 0.18302805721759796, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.31425192952156067, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.14584557712078094, + "step": 13420 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.7838, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "grad_norm": 1.3793237209320068, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "learning_rate": 0.0004861773509839127, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.9961182475090027, + "step": 13430 + }, + { + "ce_loss": 0.20061686635017395, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.37714099884033203, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.1583109200000763, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.7867060899734497, + "step": 13430 + }, + { + "ce_loss": 0.20754165947437286, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.3411576747894287, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.15391767024993896, + "step": 13430 + }, + { + "epoch": 4.482988659106071, + "loss": 0.8616, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "grad_norm": 2.627659559249878, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "learning_rate": 0.00048575531775873587, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.7955703139305115, + "step": 13440 + }, + { + "ce_loss": 0.18961289525032043, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.3066923916339874, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.1656840592622757, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.684762716293335, + "step": 13440 + }, + { + "ce_loss": 0.15056568384170532, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.305119127035141, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.12949812412261963, + "step": 13440 + }, + { + "epoch": 4.486324216144096, + "loss": 0.873, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "grad_norm": 1.5340536832809448, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "learning_rate": 0.0004853331844478754, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.805151104927063, + "step": 13450 + }, + { + "ce_loss": 0.11066021025180817, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.3033329248428345, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.14843977987766266, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.811651885509491, + "step": 13450 + }, + { + "ce_loss": 0.19692301750183105, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.34952807426452637, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.17662404477596283, + "step": 13450 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.8853, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "grad_norm": 2.993130683898926, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "learning_rate": 0.00048491095154400653, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 1.1345261335372925, + "step": 13460 + }, + { + "ce_loss": 0.2394268661737442, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.386577844619751, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.190634623169899, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.914226233959198, + "step": 13460 + }, + { + "ce_loss": 0.19585789740085602, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.36293065547943115, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.13437806069850922, + "step": 13460 + }, + { + "epoch": 4.492995330220147, + "loss": 0.8696, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "grad_norm": 1.8465981483459473, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "learning_rate": 0.00048448861953992033, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 1.0066031217575073, + "step": 13470 + }, + { + "ce_loss": 0.23128260672092438, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.37448421120643616, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.1880294680595398, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 0.9604164958000183, + "step": 13470 + }, + { + "ce_loss": 0.21962708234786987, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.3921681344509125, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.15610474348068237, + "step": 13470 + }, + { + "epoch": 4.496330887258172, + "loss": 0.8809, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "grad_norm": 2.4795334339141846, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "learning_rate": 0.0004840661889285238, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.8624405264854431, + "step": 13480 + }, + { + "ce_loss": 0.2435741424560547, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.3684362471103668, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.19913016259670258, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.9379405975341797, + "step": 13480 + }, + { + "ce_loss": 0.17425698041915894, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.31786245107650757, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.14343689382076263, + "step": 13480 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.9711, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "grad_norm": 2.149953603744507, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "learning_rate": 0.0004836436602028389, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.8364251255989075, + "step": 13490 + }, + { + "ce_loss": 0.2063017636537552, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.4093954563140869, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.15085582435131073, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 1.26799738407135, + "step": 13490 + }, + { + "ce_loss": 0.20769761502742767, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.387866348028183, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.18510855734348297, + "step": 13490 + }, + { + "epoch": 4.503002001334223, + "loss": 0.9009, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "grad_norm": 2.5156893730163574, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "learning_rate": 0.0004832210338560022, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.9566195011138916, + "step": 13500 + }, + { + "ce_loss": 0.20787490904331207, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.4229164719581604, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.1770862191915512, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.9364886283874512, + "step": 13500 + }, + { + "ce_loss": 0.276179701089859, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.457122266292572, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.16380095481872559, + "step": 13500 + }, + { + "epoch": 4.506337558372248, + "loss": 0.896, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "grad_norm": 1.6427794694900513, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "learning_rate": 0.0004827983103812638, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.8760179281234741, + "step": 13510 + }, + { + "ce_loss": 0.24923034012317657, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.3755168616771698, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.15355350077152252, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.8987492918968201, + "step": 13510 + }, + { + "ce_loss": 0.29019269347190857, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.3846389651298523, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.18040981888771057, + "step": 13510 + }, + { + "epoch": 4.509673115410274, + "loss": 0.8248, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "grad_norm": 2.027052879333496, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "learning_rate": 0.00048237549027198805, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 1.0097908973693848, + "step": 13520 + }, + { + "ce_loss": 0.2561718821525574, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.3744969069957733, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.23328308761119843, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 0.87522292137146, + "step": 13520 + }, + { + "ce_loss": 0.2116980105638504, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.2912207841873169, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.15882287919521332, + "step": 13520 + }, + { + "epoch": 4.513008672448299, + "loss": 0.8352, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "grad_norm": 1.5604941844940186, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "learning_rate": 0.0004819525740216509, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 0.7068662643432617, + "step": 13530 + }, + { + "ce_loss": 0.1933564394712448, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.35956916213035583, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.15367421507835388, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 1.0363969802856445, + "step": 13530 + }, + { + "ce_loss": 0.16680419445037842, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.35213443636894226, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.16435660421848297, + "step": 13530 + }, + { + "epoch": 4.516344229486324, + "loss": 0.8397, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "grad_norm": 1.7330466508865356, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "learning_rate": 0.0004815295621238415, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.9030655026435852, + "step": 13540 + }, + { + "ce_loss": 0.25999701023101807, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.36837413907051086, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.19690430164337158, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.9909688830375671, + "step": 13540 + }, + { + "ce_loss": 0.24681763350963593, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.38454490900039673, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.1610337644815445, + "step": 13540 + }, + { + "epoch": 4.51967978652435, + "loss": 0.8423, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "grad_norm": 1.7804971933364868, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "learning_rate": 0.0004811064550722602, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.8210447430610657, + "step": 13550 + }, + { + "ce_loss": 0.19143234193325043, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.31518861651420593, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.16232028603553772, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.8883418440818787, + "step": 13550 + }, + { + "ce_loss": 0.2613326609134674, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.3267996907234192, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.1897214651107788, + "step": 13550 + }, + { + "epoch": 4.523015343562375, + "loss": 0.8078, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "grad_norm": 1.4993454217910767, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "learning_rate": 0.00048068325336071845, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.9455620050430298, + "step": 13560 + }, + { + "ce_loss": 0.22340868413448334, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.2784824073314667, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.17123597860336304, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.7702107429504395, + "step": 13560 + }, + { + "ce_loss": 0.22906190156936646, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.32264962792396545, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.17025819420814514, + "step": 13560 + }, + { + "epoch": 4.5263509006004, + "loss": 0.7676, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "grad_norm": 2.0894381999969482, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "learning_rate": 0.0004802599574831381, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.8185257911682129, + "step": 13570 + }, + { + "ce_loss": 0.20196986198425293, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.29363006353378296, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.16779692471027374, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.6812776327133179, + "step": 13570 + }, + { + "ce_loss": 0.11444456875324249, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.3058815598487854, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.13752155005931854, + "step": 13570 + }, + { + "epoch": 4.529686457638426, + "loss": 0.8433, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "grad_norm": 2.1083929538726807, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "learning_rate": 0.000479836567933551, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.8970056772232056, + "step": 13580 + }, + { + "ce_loss": 0.2174151986837387, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.3981429636478424, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.17626845836639404, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.7080292701721191, + "step": 13580 + }, + { + "ce_loss": 0.1591300666332245, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.33692148327827454, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.11072871834039688, + "step": 13580 + }, + { + "epoch": 4.533022014676451, + "loss": 0.808, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "grad_norm": 1.7732107639312744, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "learning_rate": 0.0004794130852060984, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.7357549071311951, + "step": 13590 + }, + { + "ce_loss": 0.22856561839580536, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.3111702501773834, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.15477800369262695, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.842008113861084, + "step": 13590 + }, + { + "ce_loss": 0.22534911334514618, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.40802642703056335, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.20828992128372192, + "step": 13590 + }, + { + "epoch": 4.536357571714476, + "loss": 0.78, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "grad_norm": 1.7454679012298584, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "learning_rate": 0.0004789895097950301, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 0.9143649339675903, + "step": 13600 + }, + { + "ce_loss": 0.2446441650390625, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.35699325799942017, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.16967988014221191, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 1.133493185043335, + "step": 13600 + }, + { + "ce_loss": 0.27196764945983887, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.392011821269989, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.17302842438220978, + "step": 13600 + }, + { + "epoch": 4.539693128752502, + "loss": 0.8807, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "grad_norm": 1.6861317157745361, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "learning_rate": 0.00047856584219470424, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 0.6395084857940674, + "step": 13610 + }, + { + "ce_loss": 0.17107786238193512, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.2538531720638275, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.16082721948623657, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 1.0053611993789673, + "step": 13610 + }, + { + "ce_loss": 0.19790354371070862, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.33340319991111755, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.18687112629413605, + "step": 13610 + }, + { + "epoch": 4.543028685790527, + "loss": 0.8885, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "grad_norm": 2.6688756942749023, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "learning_rate": 0.00047814208289958664, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.7943642139434814, + "step": 13620 + }, + { + "ce_loss": 0.17814122140407562, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.3267526626586914, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.15404582023620605, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.6207186579704285, + "step": 13620 + }, + { + "ce_loss": 0.1744394153356552, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.266560435295105, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.12551294267177582, + "step": 13620 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.7633, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "grad_norm": 1.8141019344329834, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "learning_rate": 0.0004777182324042497, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.6916319131851196, + "step": 13630 + }, + { + "ce_loss": 0.16996467113494873, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.3146441578865051, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.17255429923534393, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.7187036275863647, + "step": 13630 + }, + { + "ce_loss": 0.2080836296081543, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.2987633943557739, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.16911211609840393, + "step": 13630 + }, + { + "epoch": 4.549699799866578, + "loss": 0.8673, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "grad_norm": 1.6610087156295776, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "learning_rate": 0.00047729429120337284, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.7459085583686829, + "step": 13640 + }, + { + "ce_loss": 0.19113396108150482, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.2890676259994507, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.16471309959888458, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.7211428284645081, + "step": 13640 + }, + { + "ce_loss": 0.22979314625263214, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.3184564709663391, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.17257662117481232, + "step": 13640 + }, + { + "epoch": 4.553035356904603, + "loss": 0.8389, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "grad_norm": 1.836702585220337, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "learning_rate": 0.00047687025979174086, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.8416532278060913, + "step": 13650 + }, + { + "ce_loss": 0.19439411163330078, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.31022632122039795, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.19999051094055176, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.9537258148193359, + "step": 13650 + }, + { + "ce_loss": 0.29107460379600525, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.4233335256576538, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.18829721212387085, + "step": 13650 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.8613, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "grad_norm": 1.539566993713379, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "learning_rate": 0.00047644613866424415, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 1.090315580368042, + "step": 13660 + }, + { + "ce_loss": 0.2049546092748642, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.35115835070610046, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.21695077419281006, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 1.1450974941253662, + "step": 13660 + }, + { + "ce_loss": 0.18329203128814697, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.3362187445163727, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.13686105608940125, + "step": 13660 + }, + { + "epoch": 4.559706470980654, + "loss": 0.9193, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "grad_norm": 2.8135159015655518, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "learning_rate": 0.0004760219283158776, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.6657029390335083, + "step": 13670 + }, + { + "ce_loss": 0.1496344953775406, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.351999968290329, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.12632089853286743, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.9592874050140381, + "step": 13670 + }, + { + "ce_loss": 0.17044517397880554, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.3963066637516022, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.16819757223129272, + "step": 13670 + }, + { + "epoch": 4.563042028018679, + "loss": 0.8466, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "grad_norm": 3.001901388168335, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "learning_rate": 0.00047559762924174055, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.9727820158004761, + "step": 13680 + }, + { + "ce_loss": 0.22695204615592957, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.3284171521663666, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.18167749047279358, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.9198602437973022, + "step": 13680 + }, + { + "ce_loss": 0.21199175715446472, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.4086395800113678, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.16664999723434448, + "step": 13680 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.9568, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "grad_norm": 2.173248052597046, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "learning_rate": 0.0004751732419370354, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.9737968444824219, + "step": 13690 + }, + { + "ce_loss": 0.28544288873672485, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.44596606492996216, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.1873977780342102, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.8217427134513855, + "step": 13690 + }, + { + "ce_loss": 0.21305979788303375, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.3435061573982239, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.14034755527973175, + "step": 13690 + }, + { + "epoch": 4.56971314209473, + "loss": 0.8467, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "grad_norm": 1.6340337991714478, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "learning_rate": 0.00047474876689706814, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.8798074722290039, + "step": 13700 + }, + { + "ce_loss": 0.19881856441497803, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.3740568161010742, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.1589992195367813, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.80402672290802, + "step": 13700 + }, + { + "ce_loss": 0.17962594330310822, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.30576291680336, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.15402908623218536, + "step": 13700 + }, + { + "epoch": 4.573048699132755, + "loss": 0.8932, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "grad_norm": 1.817192554473877, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "learning_rate": 0.00047432420461724636, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.8386541604995728, + "step": 13710 + }, + { + "ce_loss": 0.29571789503097534, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.31294816732406616, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.21605390310287476, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.8117653727531433, + "step": 13710 + }, + { + "ce_loss": 0.2589631974697113, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.3590703010559082, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.16542576253414154, + "step": 13710 + }, + { + "epoch": 4.576384256170781, + "loss": 0.8229, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "grad_norm": 1.6762044429779053, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "learning_rate": 0.0004738995555930803, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 0.7958192229270935, + "step": 13720 + }, + { + "ce_loss": 0.23379138112068176, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.3523423969745636, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.14432045817375183, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 1.1285977363586426, + "step": 13720 + }, + { + "ce_loss": 0.20815251767635345, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.33814939856529236, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.18970946967601776, + "step": 13720 + }, + { + "epoch": 4.579719813208806, + "loss": 0.8069, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "grad_norm": 2.443239212036133, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "learning_rate": 0.0004734748203201809, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.7413336634635925, + "step": 13730 + }, + { + "ce_loss": 0.1772276610136032, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.3523496091365814, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.1505611091852188, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.8777781128883362, + "step": 13730 + }, + { + "ce_loss": 0.2584392726421356, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.371855229139328, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.19492730498313904, + "step": 13730 + }, + { + "epoch": 4.583055370246831, + "loss": 0.9522, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "grad_norm": 1.348532795906067, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "learning_rate": 0.00047304999929426004, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.9521989226341248, + "step": 13740 + }, + { + "ce_loss": 0.2450057864189148, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.4274292588233948, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.20220626890659332, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.9642909169197083, + "step": 13740 + }, + { + "ce_loss": 0.18685844540596008, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.3277007043361664, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.13163049519062042, + "step": 13740 + }, + { + "epoch": 4.586390927284857, + "loss": 0.983, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "grad_norm": 2.962904453277588, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "learning_rate": 0.0004726250930111295, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.7624634504318237, + "step": 13750 + }, + { + "ce_loss": 0.18860645592212677, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.33328020572662354, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.131907120347023, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.7788711786270142, + "step": 13750 + }, + { + "ce_loss": 0.19581912457942963, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.36439990997314453, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.11892513185739517, + "step": 13750 + }, + { + "epoch": 4.589726484322882, + "loss": 0.8685, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "grad_norm": 1.640178918838501, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "learning_rate": 0.0004722001019667006, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.6929627060890198, + "step": 13760 + }, + { + "ce_loss": 0.17001688480377197, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.3402334451675415, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.1825055181980133, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.9117600321769714, + "step": 13760 + }, + { + "ce_loss": 0.22071193158626556, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.46392345428466797, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.2267303317785263, + "step": 13760 + }, + { + "epoch": 4.593062041360907, + "loss": 0.8807, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "grad_norm": 3.823497772216797, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "learning_rate": 0.00047177502665698355, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.8650383353233337, + "step": 13770 + }, + { + "ce_loss": 0.17190320789813995, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.26395130157470703, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.18300709128379822, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.7681661248207092, + "step": 13770 + }, + { + "ce_loss": 0.23117798566818237, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.36607617139816284, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.17063121497631073, + "step": 13770 + }, + { + "epoch": 4.596397598398933, + "loss": 0.8961, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "grad_norm": 2.5398452281951904, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "learning_rate": 0.0004713498675780871, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 1.2317657470703125, + "step": 13780 + }, + { + "ce_loss": 0.21036627888679504, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.37952694296836853, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.17117281258106232, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 0.9808338284492493, + "step": 13780 + }, + { + "ce_loss": 0.16275614500045776, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.3171769082546234, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.155286967754364, + "step": 13780 + }, + { + "epoch": 4.599733155436958, + "loss": 0.8273, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "grad_norm": 2.3886170387268066, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "learning_rate": 0.0004709246252262178, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.9716641902923584, + "step": 13790 + }, + { + "ce_loss": 0.2618649899959564, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.4601925313472748, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.205724835395813, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.7593345642089844, + "step": 13790 + }, + { + "ce_loss": 0.1982172429561615, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.35267505049705505, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.15899710357189178, + "step": 13790 + }, + { + "epoch": 4.603068712474983, + "loss": 0.9017, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "grad_norm": 1.610485315322876, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "learning_rate": 0.00047049930009767884, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 0.6228737235069275, + "step": 13800 + }, + { + "ce_loss": 0.1588118076324463, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.22418899834156036, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.13548356294631958, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 1.1625922918319702, + "step": 13800 + }, + { + "ce_loss": 0.25314319133758545, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.3250918984413147, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.22533030807971954, + "step": 13800 + }, + { + "epoch": 4.606404269513009, + "loss": 0.8051, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "grad_norm": 1.8878289461135864, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "learning_rate": 0.00047007389268887085, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 1.0144697427749634, + "step": 13810 + }, + { + "ce_loss": 0.258879691362381, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.35777097940444946, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.14117321372032166, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 0.8026508688926697, + "step": 13810 + }, + { + "ce_loss": 0.252027302980423, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.354375422000885, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.14750243723392487, + "step": 13810 + }, + { + "epoch": 4.609739826551034, + "loss": 0.8519, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "grad_norm": 2.351003646850586, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "learning_rate": 0.0004696484034962896, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.7515314221382141, + "step": 13820 + }, + { + "ce_loss": 0.20121333003044128, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.30150169134140015, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.19240356981754303, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.5818171501159668, + "step": 13820 + }, + { + "ce_loss": 0.12394590675830841, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.3400002419948578, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.11776699125766754, + "step": 13820 + }, + { + "epoch": 4.613075383589059, + "loss": 0.7783, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "grad_norm": 1.6915327310562134, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "learning_rate": 0.00046922283301652716, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.8585334420204163, + "step": 13830 + }, + { + "ce_loss": 0.2418600618839264, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.40377673506736755, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.17398446798324585, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.8019068241119385, + "step": 13830 + }, + { + "ce_loss": 0.18170292675495148, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.3564358353614807, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.1549520045518875, + "step": 13830 + }, + { + "epoch": 4.616410940627085, + "loss": 0.9287, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "grad_norm": 1.9859334230422974, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "learning_rate": 0.0004687971817462698, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.5590543150901794, + "step": 13840 + }, + { + "ce_loss": 0.14512279629707336, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.2831781208515167, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.13064338266849518, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.8300645351409912, + "step": 13840 + }, + { + "ce_loss": 0.18325687944889069, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.331007182598114, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.17050574719905853, + "step": 13840 + }, + { + "epoch": 4.61974649766511, + "loss": 0.8127, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "grad_norm": 1.4375276565551758, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "learning_rate": 0.00046837145018229854, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.8129776120185852, + "step": 13850 + }, + { + "ce_loss": 0.16128899157047272, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.4503794014453888, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.18895433843135834, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.7150787115097046, + "step": 13850 + }, + { + "ce_loss": 0.1600542962551117, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.38444283604621887, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.1291278898715973, + "step": 13850 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.9294, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "grad_norm": 1.7123420238494873, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "learning_rate": 0.0004679456388214877, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 1.0135622024536133, + "step": 13860 + }, + { + "ce_loss": 0.1946658492088318, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.4028632938861847, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.16513720154762268, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.6947537660598755, + "step": 13860 + }, + { + "ce_loss": 0.17143045365810394, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.3919539451599121, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.13105444610118866, + "step": 13860 + }, + { + "epoch": 4.626417611741161, + "loss": 0.8752, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "grad_norm": 2.3874919414520264, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "learning_rate": 0.0004675197481608054, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.9981850385665894, + "step": 13870 + }, + { + "ce_loss": 0.2395416498184204, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.35958918929100037, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.14545199275016785, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.546391487121582, + "step": 13870 + }, + { + "ce_loss": 0.14458729326725006, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.2774547338485718, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.1227763444185257, + "step": 13870 + }, + { + "epoch": 4.629753168779186, + "loss": 0.8994, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "grad_norm": 3.77695894241333, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "learning_rate": 0.0004670937786973112, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.9475411176681519, + "step": 13880 + }, + { + "ce_loss": 0.2959573268890381, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.3495613932609558, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.1935998499393463, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.7587097883224487, + "step": 13880 + }, + { + "ce_loss": 0.23118411004543304, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.34271347522735596, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.1349596381187439, + "step": 13880 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.8409, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "grad_norm": 1.8643723726272583, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "learning_rate": 0.00046666773092815793, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.858877420425415, + "step": 13890 + }, + { + "ce_loss": 0.19074980914592743, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.40514233708381653, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.14177602529525757, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.6139600872993469, + "step": 13890 + }, + { + "ce_loss": 0.15172399580478668, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.27114883065223694, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.15620894730091095, + "step": 13890 + }, + { + "epoch": 4.636424282855237, + "loss": 0.8575, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "grad_norm": 1.9550460577011108, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "learning_rate": 0.0004662416053505888, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 1.3733259439468384, + "step": 13900 + }, + { + "ce_loss": 0.2609739303588867, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.39709389209747314, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.19748690724372864, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 0.9573059678077698, + "step": 13900 + }, + { + "ce_loss": 0.2724902927875519, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.3900492191314697, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.1998416632413864, + "step": 13900 + }, + { + "epoch": 4.639759839893262, + "loss": 0.8736, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "grad_norm": 3.21980357170105, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "learning_rate": 0.00046581540246193846, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.7142580151557922, + "step": 13910 + }, + { + "ce_loss": 0.20784704387187958, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.35620594024658203, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.14930930733680725, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.96323561668396, + "step": 13910 + }, + { + "ce_loss": 0.25436338782310486, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.3177351951599121, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.1559751331806183, + "step": 13910 + }, + { + "epoch": 4.643095396931288, + "loss": 0.8428, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "grad_norm": 2.507951259613037, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "learning_rate": 0.0004653891227596313, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.7738920450210571, + "step": 13920 + }, + { + "ce_loss": 0.1673024594783783, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.32107099890708923, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.15293675661087036, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.831015944480896, + "step": 13920 + }, + { + "ce_loss": 0.2569162845611572, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.31806325912475586, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.20909073948860168, + "step": 13920 + }, + { + "epoch": 4.646430953969313, + "loss": 0.845, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "grad_norm": 1.8366520404815674, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "learning_rate": 0.00046496276674118175, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.7713529467582703, + "step": 13930 + }, + { + "ce_loss": 0.2573528587818146, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.31989189982414246, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.15530826151371002, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.7456807494163513, + "step": 13930 + }, + { + "ce_loss": 0.2011760026216507, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.2997419536113739, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.14415857195854187, + "step": 13930 + }, + { + "epoch": 4.649766511007338, + "loss": 0.792, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "grad_norm": 4.883190631866455, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "learning_rate": 0.000464536334904193, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.7827562093734741, + "step": 13940 + }, + { + "ce_loss": 0.20156757533550262, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.36345911026000977, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.09804141521453857, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.8018506169319153, + "step": 13940 + }, + { + "ce_loss": 0.2580086886882782, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.291150838136673, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.20864073932170868, + "step": 13940 + }, + { + "epoch": 4.653102068045364, + "loss": 0.8359, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "grad_norm": 1.4981822967529297, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "learning_rate": 0.0004641098277463573, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.8859256505966187, + "step": 13950 + }, + { + "ce_loss": 0.14007827639579773, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.2375439703464508, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.14340248703956604, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.9954640865325928, + "step": 13950 + }, + { + "ce_loss": 0.22668717801570892, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.3906228542327881, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.14904038608074188, + "step": 13950 + }, + { + "epoch": 4.656437625083389, + "loss": 0.8711, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "grad_norm": 2.03786039352417, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "learning_rate": 0.00046368324576545394, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 0.891573429107666, + "step": 13960 + }, + { + "ce_loss": 0.20240835845470428, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.2832994759082794, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.18917511403560638, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 1.0462068319320679, + "step": 13960 + }, + { + "ce_loss": 0.29488521814346313, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.4123867154121399, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.1961653083562851, + "step": 13960 + }, + { + "epoch": 4.659773182121414, + "loss": 0.8192, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "grad_norm": 1.4951426982879639, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "learning_rate": 0.0004632565894593502, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.8212860822677612, + "step": 13970 + }, + { + "ce_loss": 0.21576517820358276, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.31486424803733826, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.16315826773643494, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.5121589303016663, + "step": 13970 + }, + { + "ce_loss": 0.13231956958770752, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.21873025596141815, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.15629905462265015, + "step": 13970 + }, + { + "epoch": 4.66310873915944, + "loss": 0.8827, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "grad_norm": 1.7232520580291748, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "learning_rate": 0.0004628298593259999, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.7036920785903931, + "step": 13980 + }, + { + "ce_loss": 0.15738870203495026, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.36597204208374023, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.14112482964992523, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.6786432266235352, + "step": 13980 + }, + { + "ce_loss": 0.17833980917930603, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.3547556698322296, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.14487062394618988, + "step": 13980 + }, + { + "epoch": 4.666444296197465, + "loss": 0.8998, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "grad_norm": 2.2571568489074707, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "learning_rate": 0.0004624030558634429, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.7320971488952637, + "step": 13990 + }, + { + "ce_loss": 0.20538297295570374, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.29730069637298584, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.18843059241771698, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.8136546015739441, + "step": 13990 + }, + { + "ce_loss": 0.2071070671081543, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.3963843286037445, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.20945298671722412, + "step": 13990 + }, + { + "epoch": 4.66977985323549, + "loss": 0.8314, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "grad_norm": 1.6027140617370605, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "learning_rate": 0.00046197617956980505, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.904109001159668, + "step": 14000 + }, + { + "ce_loss": 0.2536281943321228, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.39303573966026306, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.15961523354053497, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.8013550043106079, + "step": 14000 + }, + { + "ce_loss": 0.21332262456417084, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.33780455589294434, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.14180517196655273, + "step": 14000 + }, + { + "epoch": 4.673115410273516, + "loss": 0.8655, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "grad_norm": 1.8766305446624756, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "learning_rate": 0.00046154923094329656, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 1.1529908180236816, + "step": 14010 + }, + { + "ce_loss": 0.18356235325336456, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.3325393795967102, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.13770374655723572, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 0.7322327494621277, + "step": 14010 + }, + { + "ce_loss": 0.18772105872631073, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.365714967250824, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.17847004532814026, + "step": 14010 + }, + { + "epoch": 4.676450967311541, + "loss": 0.8874, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "grad_norm": 2.3067870140075684, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "learning_rate": 0.00046112221048221267, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.5756610035896301, + "step": 14020 + }, + { + "ce_loss": 0.1752990335226059, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.26576200127601624, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.1171606183052063, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.9301656484603882, + "step": 14020 + }, + { + "ce_loss": 0.2776738107204437, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.2842113971710205, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.20117861032485962, + "step": 14020 + }, + { + "epoch": 4.679786524349566, + "loss": 0.8123, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "grad_norm": 2.325228214263916, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "learning_rate": 0.00046069511868493206, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.8015709519386292, + "step": 14030 + }, + { + "ce_loss": 0.20850978791713715, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.34072285890579224, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.17537088692188263, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.682460367679596, + "step": 14030 + }, + { + "ce_loss": 0.14392144978046417, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.3384716808795929, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.13397617638111115, + "step": 14030 + }, + { + "epoch": 4.683122081387592, + "loss": 0.8589, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "grad_norm": 2.0175516605377197, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "learning_rate": 0.00046026795604991685, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.7632008790969849, + "step": 14040 + }, + { + "ce_loss": 0.22315922379493713, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.3759915232658386, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.16336357593536377, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.9702370166778564, + "step": 14040 + }, + { + "ce_loss": 0.30179935693740845, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.41837430000305176, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.20655812323093414, + "step": 14040 + }, + { + "epoch": 4.686457638425617, + "loss": 0.8679, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "grad_norm": 1.6562148332595825, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "learning_rate": 0.00045984072307571187, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 0.7191571593284607, + "step": 14050 + }, + { + "ce_loss": 0.16404734551906586, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.3208382725715637, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.11867458373308182, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 1.045770525932312, + "step": 14050 + }, + { + "ce_loss": 0.23788468539714813, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.41924071311950684, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.18163149058818817, + "step": 14050 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.8991, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "grad_norm": 2.0935001373291016, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "learning_rate": 0.000459413420260944, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 1.0229648351669312, + "step": 14060 + }, + { + "ce_loss": 0.22401516139507294, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.38364315032958984, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.2235155701637268, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 1.130496621131897, + "step": 14060 + }, + { + "ce_loss": 0.25606226921081543, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.359218567609787, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.167112797498703, + "step": 14060 + }, + { + "epoch": 4.693128752501668, + "loss": 0.8388, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "grad_norm": 1.860084056854248, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "learning_rate": 0.0004589860481043215, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 0.8229884505271912, + "step": 14070 + }, + { + "ce_loss": 0.25853830575942993, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.35701021552085876, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.20571230351924896, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 1.2947938442230225, + "step": 14070 + }, + { + "ce_loss": 0.2600174844264984, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.4015241265296936, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.22245274484157562, + "step": 14070 + }, + { + "epoch": 4.696464309539693, + "loss": 0.8792, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "grad_norm": 2.23905611038208, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "learning_rate": 0.00045855860710463373, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.863398015499115, + "step": 14080 + }, + { + "ce_loss": 0.19016389548778534, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.36668360233306885, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.17019394040107727, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.660220742225647, + "step": 14080 + }, + { + "ce_loss": 0.14625494182109833, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.37596479058265686, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.11250553280115128, + "step": 14080 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.81, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "grad_norm": 1.3756694793701172, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "learning_rate": 0.0004581310977607502, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.7182285785675049, + "step": 14090 + }, + { + "ce_loss": 0.16865786910057068, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.28787916898727417, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.1471586972475052, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.7447019219398499, + "step": 14090 + }, + { + "ce_loss": 0.20779182016849518, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.3117561638355255, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.1663566380739212, + "step": 14090 + }, + { + "epoch": 4.703135423615744, + "loss": 0.8146, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "grad_norm": 2.052716016769409, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "learning_rate": 0.0004577035205716205, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 0.610322117805481, + "step": 14100 + }, + { + "ce_loss": 0.18214602768421173, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.2763131856918335, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.15158452093601227, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 1.021213173866272, + "step": 14100 + }, + { + "ce_loss": 0.18335743248462677, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.3717056214809418, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.13745620846748352, + "step": 14100 + }, + { + "epoch": 4.706470980653769, + "loss": 0.8365, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "grad_norm": 2.3778693675994873, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "learning_rate": 0.0004572758760362731, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.8436442017555237, + "step": 14110 + }, + { + "ce_loss": 0.23669615387916565, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.3541877269744873, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.13591575622558594, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.8931856155395508, + "step": 14110 + }, + { + "ce_loss": 0.1591729372739792, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.3652481436729431, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.11494230479001999, + "step": 14110 + }, + { + "epoch": 4.709806537691795, + "loss": 0.9286, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "grad_norm": 1.803930401802063, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "learning_rate": 0.00045684816465381525, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.5938913226127625, + "step": 14120 + }, + { + "ce_loss": 0.19339902698993683, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.26114407181739807, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.13884493708610535, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.7062050104141235, + "step": 14120 + }, + { + "ce_loss": 0.15822601318359375, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.32899022102355957, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.11663496494293213, + "step": 14120 + }, + { + "epoch": 4.71314209472982, + "loss": 0.8309, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "grad_norm": 2.1349706649780273, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "learning_rate": 0.0004564203869234321, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 0.9979802370071411, + "step": 14130 + }, + { + "ce_loss": 0.26847484707832336, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.39733004570007324, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.17914006114006042, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 0.7394533157348633, + "step": 14130 + }, + { + "ce_loss": 0.1996060609817505, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.34057703614234924, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.15275272727012634, + "step": 14130 + }, + { + "epoch": 4.716477651767845, + "loss": 0.8551, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "grad_norm": 1.858211874961853, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "learning_rate": 0.0004559925433443864, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.9450689554214478, + "step": 14140 + }, + { + "ce_loss": 0.22283834218978882, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.32096511125564575, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.13867157697677612, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.8570036888122559, + "step": 14140 + }, + { + "ce_loss": 0.2400035560131073, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.3585464060306549, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.1861157864332199, + "step": 14140 + }, + { + "epoch": 4.719813208805871, + "loss": 0.7722, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "grad_norm": 3.83005952835083, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "learning_rate": 0.0004555646344160174, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.7848604321479797, + "step": 14150 + }, + { + "ce_loss": 0.22449667751789093, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.35835111141204834, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.15118904411792755, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.7087922096252441, + "step": 14150 + }, + { + "ce_loss": 0.1851268857717514, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.3466935455799103, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.14285339415073395, + "step": 14150 + }, + { + "epoch": 4.723148765843896, + "loss": 0.8637, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "grad_norm": 2.271299123764038, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "learning_rate": 0.0004551366606377412, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 1.5590428113937378, + "step": 14160 + }, + { + "ce_loss": 0.26412802934646606, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.35552310943603516, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.19402185082435608, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 0.8642193078994751, + "step": 14160 + }, + { + "ce_loss": 0.22637015581130981, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.2978854179382324, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.20083940029144287, + "step": 14160 + }, + { + "epoch": 4.726484322881921, + "loss": 0.892, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "grad_norm": 1.4375994205474854, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "learning_rate": 0.00045470862250904904, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.8493502140045166, + "step": 14170 + }, + { + "ce_loss": 0.1954825073480606, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.31222012639045715, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.2064986526966095, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.8059635758399963, + "step": 14170 + }, + { + "ce_loss": 0.20247094333171844, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.30323803424835205, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.1978103071451187, + "step": 14170 + }, + { + "epoch": 4.729819879919947, + "loss": 0.8039, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "grad_norm": 1.7913191318511963, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "learning_rate": 0.00045428052052950757, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 1.217488408088684, + "step": 14180 + }, + { + "ce_loss": 0.28452906012535095, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.4151560366153717, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.20734523236751556, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 0.6979823112487793, + "step": 14180 + }, + { + "ce_loss": 0.1885191649198532, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.32918426394462585, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.1795302927494049, + "step": 14180 + }, + { + "epoch": 4.733155436957972, + "loss": 0.8244, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "grad_norm": 1.3256746530532837, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "learning_rate": 0.00045385235519875775, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.887203574180603, + "step": 14190 + }, + { + "ce_loss": 0.21348625421524048, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.3612407445907593, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.20668892562389374, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.7140733599662781, + "step": 14190 + }, + { + "ce_loss": 0.2096211165189743, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.3249552845954895, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.1359350085258484, + "step": 14190 + }, + { + "epoch": 4.736490993995997, + "loss": 0.838, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "grad_norm": 2.408013105392456, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "learning_rate": 0.0004534241270165147, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.648318350315094, + "step": 14200 + }, + { + "ce_loss": 0.1675240397453308, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.3086930513381958, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.13107258081436157, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.633400559425354, + "step": 14200 + }, + { + "ce_loss": 0.11860974133014679, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.2925090491771698, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.10717403143644333, + "step": 14200 + }, + { + "epoch": 4.739826551034023, + "loss": 0.8424, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "grad_norm": 1.4014778137207031, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "learning_rate": 0.0004529958364825666, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.7830230593681335, + "step": 14210 + }, + { + "ce_loss": 0.1478165090084076, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.25908103585243225, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.14894166588783264, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.7763957977294922, + "step": 14210 + }, + { + "ce_loss": 0.15796737372875214, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.3048602342605591, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.1621614545583725, + "step": 14210 + }, + { + "epoch": 4.743162108072048, + "loss": 0.9225, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "grad_norm": 2.1950395107269287, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "learning_rate": 0.00045256748409677495, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 1.2173900604248047, + "step": 14220 + }, + { + "ce_loss": 0.23952677845954895, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.3450341820716858, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.2102057933807373, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 1.120995044708252, + "step": 14220 + }, + { + "ce_loss": 0.15178608894348145, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.3196716606616974, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.13504192233085632, + "step": 14220 + }, + { + "epoch": 4.746497665110073, + "loss": 0.8293, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "grad_norm": 7.004584789276123, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "learning_rate": 0.00045213907035907274, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.7378753423690796, + "step": 14230 + }, + { + "ce_loss": 0.20550483465194702, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.34348252415657043, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.1843615621328354, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.7565779089927673, + "step": 14230 + }, + { + "ce_loss": 0.19548478722572327, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.38570237159729004, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.17464320361614227, + "step": 14230 + }, + { + "epoch": 4.749833222148099, + "loss": 0.8394, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "grad_norm": 1.8284156322479248, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "learning_rate": 0.0004517105957694652, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 0.7750556468963623, + "step": 14240 + }, + { + "ce_loss": 0.19875389337539673, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.34265342354774475, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.16989080607891083, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 1.022950530052185, + "step": 14240 + }, + { + "ce_loss": 0.22671598196029663, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.3508990406990051, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.16097328066825867, + "step": 14240 + }, + { + "epoch": 4.753168779186124, + "loss": 0.8546, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "grad_norm": 1.2136383056640625, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "learning_rate": 0.00045128206082802784, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 0.8702363967895508, + "step": 14250 + }, + { + "ce_loss": 0.1765323132276535, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.36349090933799744, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.1928870975971222, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 1.055302619934082, + "step": 14250 + }, + { + "ce_loss": 0.2146630883216858, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.38636380434036255, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.142844557762146, + "step": 14250 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.8101, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "grad_norm": 1.6025526523590088, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "learning_rate": 0.0004508534660349074, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.8958355188369751, + "step": 14260 + }, + { + "ce_loss": 0.243308424949646, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.3873001039028168, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.16264839470386505, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.8794652223587036, + "step": 14260 + }, + { + "ce_loss": 0.1975322812795639, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.38459375500679016, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.17643694579601288, + "step": 14260 + }, + { + "epoch": 4.759839893262175, + "loss": 0.8606, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "grad_norm": 2.014094114303589, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "learning_rate": 0.00045042481189032016, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.7878975868225098, + "step": 14270 + }, + { + "ce_loss": 0.2223137468099594, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.34276866912841797, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.1719190776348114, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.7920337319374084, + "step": 14270 + }, + { + "ce_loss": 0.19568029046058655, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.3749501705169678, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.158006951212883, + "step": 14270 + }, + { + "epoch": 4.7631754503002, + "loss": 0.8574, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "grad_norm": 2.1007559299468994, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "learning_rate": 0.0004499960988945514, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.5976972579956055, + "step": 14280 + }, + { + "ce_loss": 0.17164160311222076, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.29972830414772034, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.12624342739582062, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.6230519413948059, + "step": 14280 + }, + { + "ce_loss": 0.17910799384117126, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.27381521463394165, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.1337912231683731, + "step": 14280 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.7983, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "grad_norm": 1.765936255455017, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "learning_rate": 0.0004495673275479554, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.7464971542358398, + "step": 14290 + }, + { + "ce_loss": 0.2413792610168457, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.320978581905365, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.18388238549232483, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.7573192119598389, + "step": 14290 + }, + { + "ce_loss": 0.24678291380405426, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.29336345195770264, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.1423608660697937, + "step": 14290 + }, + { + "epoch": 4.769846564376251, + "loss": 0.8084, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "grad_norm": 1.6711090803146362, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "learning_rate": 0.0004491384983509546, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.7083107829093933, + "step": 14300 + }, + { + "ce_loss": 0.21236170828342438, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.2792416214942932, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.15519611537456512, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.7527049779891968, + "step": 14300 + }, + { + "ce_loss": 0.22505830228328705, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.312044620513916, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.1796731948852539, + "step": 14300 + }, + { + "epoch": 4.773182121414276, + "loss": 0.8198, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "grad_norm": 1.9802237749099731, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "learning_rate": 0.0004487096118040387, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 1.025206208229065, + "step": 14310 + }, + { + "ce_loss": 0.27857324481010437, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.4218524396419525, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.1912098228931427, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 0.8435007929801941, + "step": 14310 + }, + { + "ce_loss": 0.2689407467842102, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.3354867100715637, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.19938433170318604, + "step": 14310 + }, + { + "epoch": 4.776517678452302, + "loss": 0.8582, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "grad_norm": 1.2547904253005981, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "learning_rate": 0.00044828066840776426, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.6429987549781799, + "step": 14320 + }, + { + "ce_loss": 0.1616217941045761, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.30774685740470886, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.17335504293441772, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.8333144783973694, + "step": 14320 + }, + { + "ce_loss": 0.1563624143600464, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.3343176245689392, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.15886586904525757, + "step": 14320 + }, + { + "epoch": 4.779853235490327, + "loss": 0.8333, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "grad_norm": 2.450331926345825, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "learning_rate": 0.00044785166866275486, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.7152910828590393, + "step": 14330 + }, + { + "ce_loss": 0.1755499690771103, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.3227419853210449, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.13227547705173492, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.7708675265312195, + "step": 14330 + }, + { + "ce_loss": 0.254288911819458, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.32725656032562256, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.1537141501903534, + "step": 14330 + }, + { + "epoch": 4.783188792528352, + "loss": 0.7857, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "grad_norm": 2.2568359375, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "learning_rate": 0.0004474226130696989, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 0.6318008303642273, + "step": 14340 + }, + { + "ce_loss": 0.1493198722600937, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.3203362822532654, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.16190843284130096, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 1.5504295825958252, + "step": 14340 + }, + { + "ce_loss": 0.25872090458869934, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.3019382655620575, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.16322508454322815, + "step": 14340 + }, + { + "epoch": 4.786524349566378, + "loss": 0.8551, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "grad_norm": 3.3923728466033936, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "learning_rate": 0.0004469935021293507, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.6944651007652283, + "step": 14350 + }, + { + "ce_loss": 0.15247030556201935, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.2727074921131134, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.1375962793827057, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.815066933631897, + "step": 14350 + }, + { + "ce_loss": 0.21010121703147888, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.3797869384288788, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.17980359494686127, + "step": 14350 + }, + { + "epoch": 4.789859906604403, + "loss": 0.8532, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "grad_norm": 1.3975764513015747, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "learning_rate": 0.00044656433634252863, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.7048535943031311, + "step": 14360 + }, + { + "ce_loss": 0.18724028766155243, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.30819523334503174, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.1732262521982193, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.9230031967163086, + "step": 14360 + }, + { + "ce_loss": 0.2237575799226761, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.3416767120361328, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.21102052927017212, + "step": 14360 + }, + { + "epoch": 4.793195463642428, + "loss": 0.8148, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "grad_norm": 1.9771432876586914, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "learning_rate": 0.00044613511621011565, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.7248369455337524, + "step": 14370 + }, + { + "ce_loss": 0.20936031639575958, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.3045058846473694, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.17129839956760406, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.9662963151931763, + "step": 14370 + }, + { + "ce_loss": 0.2426602840423584, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.3995358943939209, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.15862718224525452, + "step": 14370 + }, + { + "epoch": 4.796531020680454, + "loss": 0.8521, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "grad_norm": 2.1875810623168945, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "learning_rate": 0.00044570584223305767, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.7186112999916077, + "step": 14380 + }, + { + "ce_loss": 0.21039500832557678, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.2966489791870117, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.1435556709766388, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.9201071262359619, + "step": 14380 + }, + { + "ce_loss": 0.19870568811893463, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.2828711271286011, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.1364685297012329, + "step": 14380 + }, + { + "epoch": 4.799866577718479, + "loss": 1.2347, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "grad_norm": 2.3982861042022705, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "learning_rate": 0.00044527651491236376, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.7846877574920654, + "step": 14390 + }, + { + "ce_loss": 0.2630446255207062, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.3383769989013672, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.18286670744419098, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.7356339693069458, + "step": 14390 + }, + { + "ce_loss": 0.20563283562660217, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.3828813433647156, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.1468716263771057, + "step": 14390 + }, + { + "epoch": 4.803202134756504, + "loss": 0.8352, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "grad_norm": 1.5926655530929565, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "learning_rate": 0.00044484713474910484, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.987956166267395, + "step": 14400 + }, + { + "ce_loss": 0.20529904961585999, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.36365723609924316, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.1763196885585785, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.8181911110877991, + "step": 14400 + }, + { + "ce_loss": 0.18043328821659088, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.39718514680862427, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.17551366984844208, + "step": 14400 + }, + { + "epoch": 4.80653769179453, + "loss": 0.815, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "grad_norm": 2.538222074508667, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "learning_rate": 0.000444417702244414, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 1.3166474103927612, + "step": 14410 + }, + { + "ce_loss": 0.18348243832588196, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.29405656456947327, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.16508980095386505, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 0.853499174118042, + "step": 14410 + }, + { + "ce_loss": 0.22681495547294617, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.3703365921974182, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.15008428692817688, + "step": 14410 + }, + { + "epoch": 4.809873248832555, + "loss": 1.1308, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "grad_norm": 76.6787338256836, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "learning_rate": 0.0004439882178994851, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 1.8646230697631836, + "step": 14420 + }, + { + "ce_loss": 1.0506770610809326, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.09022989124059677, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.6693422198295593, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 1.9133636951446533, + "step": 14420 + }, + { + "ce_loss": 1.0093390941619873, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.11060499399900436, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.642074465751648, + "step": 14420 + }, + { + "epoch": 4.81320880587058, + "loss": 1.5584, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "grad_norm": 2.6255245208740234, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "learning_rate": 0.0004435586822155725, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 1.3784888982772827, + "step": 14430 + }, + { + "ce_loss": 0.8630189895629883, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.06988421082496643, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.44548124074935913, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 1.231710433959961, + "step": 14430 + }, + { + "ce_loss": 0.7029614448547363, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.06970061361789703, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.4138675034046173, + "step": 14430 + }, + { + "epoch": 4.816544362908606, + "loss": 1.3034, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "grad_norm": 1.8560012578964233, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "learning_rate": 0.00044312909569399066, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 1.188285231590271, + "step": 14440 + }, + { + "ce_loss": 0.7114423513412476, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.10720157623291016, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.3693326413631439, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 1.2714197635650635, + "step": 14440 + }, + { + "ce_loss": 0.717993438243866, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.06978762149810791, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.3870980143547058, + "step": 14440 + }, + { + "epoch": 4.819879919946631, + "loss": 1.0291, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "grad_norm": 2.1931910514831543, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "learning_rate": 0.0004426994588361134, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 1.0039728879928589, + "step": 14450 + }, + { + "ce_loss": 0.2299180030822754, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.329211950302124, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.17068281769752502, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 1.1071248054504395, + "step": 14450 + }, + { + "ce_loss": 0.27436718344688416, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.504375696182251, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.18603618443012238, + "step": 14450 + }, + { + "epoch": 4.8232154769846565, + "loss": 1.1783, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "grad_norm": 9.356254577636719, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "learning_rate": 0.00044226977214337286, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 1.689245581626892, + "step": 14460 + }, + { + "ce_loss": 0.4103279411792755, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 1.0289390087127686, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.1930885761976242, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 1.4320827722549438, + "step": 14460 + }, + { + "ce_loss": 0.34104785323143005, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 0.7984983921051025, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.2489304393529892, + "step": 14460 + }, + { + "epoch": 4.826551034022682, + "loss": 0.9409, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "grad_norm": 1.7776070833206177, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "learning_rate": 0.00044184003611726, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.9860867261886597, + "step": 14470 + }, + { + "ce_loss": 0.25365740060806274, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.337404727935791, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.19550800323486328, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.8277320265769958, + "step": 14470 + }, + { + "ce_loss": 0.2675633132457733, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.3357546329498291, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.18932946026325226, + "step": 14470 + }, + { + "epoch": 4.829886591060707, + "loss": 0.7971, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "grad_norm": 2.111743450164795, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "learning_rate": 0.0004414102512593226, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 0.761222779750824, + "step": 14480 + }, + { + "ce_loss": 0.1951623260974884, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.3789231479167938, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.14075884222984314, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 1.1427268981933594, + "step": 14480 + }, + { + "ce_loss": 0.2484903633594513, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.42528820037841797, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.16807828843593597, + "step": 14480 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.7806, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "grad_norm": 1.6515129804611206, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "learning_rate": 0.0004409804180711662, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.9863893985748291, + "step": 14490 + }, + { + "ce_loss": 0.21996329724788666, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.3274819850921631, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.19819903373718262, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.6463009715080261, + "step": 14490 + }, + { + "ce_loss": 0.15356984734535217, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.31557101011276245, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.13610194623470306, + "step": 14490 + }, + { + "epoch": 4.836557705136758, + "loss": 0.7817, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "grad_norm": 2.982213020324707, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "learning_rate": 0.00044055053705445213, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.7263142466545105, + "step": 14500 + }, + { + "ce_loss": 0.1957697570323944, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.3093082010746002, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.16477113962173462, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.7457721829414368, + "step": 14500 + }, + { + "ce_loss": 0.21496635675430298, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.31695231795310974, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.17643709480762482, + "step": 14500 + }, + { + "epoch": 4.839893262174783, + "loss": 0.795, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "grad_norm": 1.7769856452941895, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "learning_rate": 0.000440120608710898, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 1.05339515209198, + "step": 14510 + }, + { + "ce_loss": 0.21219713985919952, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.3491799533367157, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.14955969154834747, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 0.9170594811439514, + "step": 14510 + }, + { + "ce_loss": 0.3033076822757721, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.36640915274620056, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.20215469598770142, + "step": 14510 + }, + { + "epoch": 4.843228819212809, + "loss": 0.8816, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "grad_norm": 2.1591436862945557, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "learning_rate": 0.0004396906335422763, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.9165817499160767, + "step": 14520 + }, + { + "ce_loss": 0.2852003276348114, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.4034644663333893, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.18268036842346191, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.8185547590255737, + "step": 14520 + }, + { + "ce_loss": 0.20989814400672913, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.3294355273246765, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.17076851427555084, + "step": 14520 + }, + { + "epoch": 4.846564376250834, + "loss": 0.8401, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "grad_norm": 2.5680665969848633, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "learning_rate": 0.00043926061205041444, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 0.6906549334526062, + "step": 14530 + }, + { + "ce_loss": 0.15018831193447113, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.22638709843158722, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.15170030295848846, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 1.0279051065444946, + "step": 14530 + }, + { + "ce_loss": 0.1601780354976654, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.3407314121723175, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.12231296300888062, + "step": 14530 + }, + { + "epoch": 4.849899933288859, + "loss": 0.8612, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "grad_norm": 1.8162392377853394, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "learning_rate": 0.0004388305447371936, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 0.7569298148155212, + "step": 14540 + }, + { + "ce_loss": 0.1900038719177246, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.3315070569515228, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.18197223544120789, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 1.090651512145996, + "step": 14540 + }, + { + "ce_loss": 0.25102272629737854, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.326902836561203, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.17948073148727417, + "step": 14540 + }, + { + "epoch": 4.853235490326885, + "loss": 0.9135, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "grad_norm": 1.8046009540557861, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "learning_rate": 0.00043840043210454873, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.7219487428665161, + "step": 14550 + }, + { + "ce_loss": 0.24534782767295837, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.29573965072631836, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.16810734570026398, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.7448170781135559, + "step": 14550 + }, + { + "ce_loss": 0.20320436358451843, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.34010350704193115, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.15430478751659393, + "step": 14550 + }, + { + "epoch": 4.85657104736491, + "loss": 0.7886, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "grad_norm": 2.4193296432495117, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "learning_rate": 0.0004379702746544675, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.8893520832061768, + "step": 14560 + }, + { + "ce_loss": 0.25183916091918945, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.3263954520225525, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.19890064001083374, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.7266795039176941, + "step": 14560 + }, + { + "ce_loss": 0.17995253205299377, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.34926894307136536, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.14957621693611145, + "step": 14560 + }, + { + "epoch": 4.859906604402935, + "loss": 0.8756, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "grad_norm": 2.1137895584106445, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "learning_rate": 0.00043754007288899013, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.8555729389190674, + "step": 14570 + }, + { + "ce_loss": 0.1920050084590912, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.4450578987598419, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.16308441758155823, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.7460584044456482, + "step": 14570 + }, + { + "ce_loss": 0.20282535254955292, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.38569849729537964, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.1571381390094757, + "step": 14570 + }, + { + "epoch": 4.863242161440961, + "loss": 0.8363, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "grad_norm": 1.1724597215652466, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "learning_rate": 0.00043710982731020806, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.9502582550048828, + "step": 14580 + }, + { + "ce_loss": 0.27301228046417236, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.3538023829460144, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.21068429946899414, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.5929126143455505, + "step": 14580 + }, + { + "ce_loss": 0.14882758259773254, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.27876344323158264, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.13127300143241882, + "step": 14580 + }, + { + "epoch": 4.866577718478986, + "loss": 0.7924, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "grad_norm": 1.5373058319091797, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "learning_rate": 0.0004366795384202644, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.8771259784698486, + "step": 14590 + }, + { + "ce_loss": 0.2793692946434021, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.32669538259506226, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.2203112691640854, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.8765442371368408, + "step": 14590 + }, + { + "ce_loss": 0.2435297966003418, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.37883514165878296, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.16562002897262573, + "step": 14590 + }, + { + "epoch": 4.869913275517011, + "loss": 0.8328, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "grad_norm": 1.6567118167877197, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "learning_rate": 0.0004362492067213526, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 0.8297321796417236, + "step": 14600 + }, + { + "ce_loss": 0.25611981749534607, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.3246353268623352, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.17316275835037231, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 1.1873735189437866, + "step": 14600 + }, + { + "ce_loss": 0.25334230065345764, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.36492976546287537, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.14517346024513245, + "step": 14600 + }, + { + "epoch": 4.873248832555037, + "loss": 0.8133, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "grad_norm": 1.6364003419876099, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "learning_rate": 0.00043581883271571586, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.8985888957977295, + "step": 14610 + }, + { + "ce_loss": 0.22380895912647247, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.3480800986289978, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.18579760193824768, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.880142331123352, + "step": 14610 + }, + { + "ce_loss": 0.22697052359580994, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.36627066135406494, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.14945651590824127, + "step": 14610 + }, + { + "epoch": 4.876584389593062, + "loss": 0.7902, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "grad_norm": 1.6763052940368652, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "learning_rate": 0.0004353884169056472, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 0.8141733407974243, + "step": 14620 + }, + { + "ce_loss": 0.1499975621700287, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.3481467068195343, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.1621495932340622, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 1.2638483047485352, + "step": 14620 + }, + { + "ce_loss": 0.22092504799365997, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.34048253297805786, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.1847701519727707, + "step": 14620 + }, + { + "epoch": 4.879919946631087, + "loss": 0.8409, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "grad_norm": 1.3977830410003662, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "learning_rate": 0.0004349579597934879, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.6937637329101562, + "step": 14630 + }, + { + "ce_loss": 0.23181191086769104, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.2919282615184784, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.16963261365890503, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.7458980083465576, + "step": 14630 + }, + { + "ce_loss": 0.13273221254348755, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.3241432309150696, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.17733998596668243, + "step": 14630 + }, + { + "epoch": 4.883255503669113, + "loss": 0.7683, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "grad_norm": 1.6077567338943481, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "learning_rate": 0.00043452746188162803, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.7748646140098572, + "step": 14640 + }, + { + "ce_loss": 0.1977359652519226, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.3178797662258148, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.20306116342544556, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.8212810158729553, + "step": 14640 + }, + { + "ce_loss": 0.19623126089572906, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.3520052134990692, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.12612716853618622, + "step": 14640 + }, + { + "epoch": 4.886591060707138, + "loss": 0.857, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "grad_norm": 1.7271556854248047, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "learning_rate": 0.0004340969236725046, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.7299498915672302, + "step": 14650 + }, + { + "ce_loss": 0.1944878250360489, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.36202430725097656, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.17322170734405518, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.878600537776947, + "step": 14650 + }, + { + "ce_loss": 0.25492069125175476, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.40417933464050293, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.16458988189697266, + "step": 14650 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.9618, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "grad_norm": 2.2485077381134033, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "learning_rate": 0.0004336663456686026, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.9936636090278625, + "step": 14660 + }, + { + "ce_loss": 0.2919512689113617, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.3717796802520752, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.2658803164958954, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.8192309737205505, + "step": 14660 + }, + { + "ce_loss": 0.16883529722690582, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.3290097415447235, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.14802150428295135, + "step": 14660 + }, + { + "epoch": 4.893262174783189, + "loss": 0.8059, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "grad_norm": 2.89357852935791, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "learning_rate": 0.0004332357283724523, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 1.29874587059021, + "step": 14670 + }, + { + "ce_loss": 0.1667264997959137, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.22527697682380676, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.16906295716762543, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 0.5599315166473389, + "step": 14670 + }, + { + "ce_loss": 0.15608477592468262, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.24146291613578796, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.11661271750926971, + "step": 14670 + }, + { + "epoch": 4.896597731821214, + "loss": 0.7917, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "grad_norm": 1.482553482055664, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "learning_rate": 0.00043280507228663086, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.8754822015762329, + "step": 14680 + }, + { + "ce_loss": 0.10988114774227142, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.22368855774402618, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.1033942922949791, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.8509879112243652, + "step": 14680 + }, + { + "ce_loss": 0.167730450630188, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.25954991579055786, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.18977469205856323, + "step": 14680 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.8584, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "grad_norm": 1.982042908668518, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "learning_rate": 0.00043237437791375993, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.6484662890434265, + "step": 14690 + }, + { + "ce_loss": 0.16622701287269592, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.29128366708755493, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.18995237350463867, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.8472151160240173, + "step": 14690 + }, + { + "ce_loss": 0.20356720685958862, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.2969290316104889, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.15106698870658875, + "step": 14690 + }, + { + "epoch": 4.903268845897265, + "loss": 0.8098, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "grad_norm": 4.553890705108643, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "learning_rate": 0.0004319436457565064, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 0.8041062951087952, + "step": 14700 + }, + { + "ce_loss": 0.20178014039993286, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.3266127109527588, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.1578376144170761, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 1.2332589626312256, + "step": 14700 + }, + { + "ce_loss": 0.22578376531600952, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.31459343433380127, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.18273325264453888, + "step": 14700 + }, + { + "epoch": 4.90660440293529, + "loss": 0.8274, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "grad_norm": 1.8715195655822754, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "learning_rate": 0.00043151287631758094, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.5332189798355103, + "step": 14710 + }, + { + "ce_loss": 0.13371208310127258, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.23901522159576416, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.1602819859981537, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.9503391981124878, + "step": 14710 + }, + { + "ce_loss": 0.21474634110927582, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.34557732939720154, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.1305851936340332, + "step": 14710 + }, + { + "epoch": 4.909939959973316, + "loss": 0.729, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "grad_norm": 1.2429009675979614, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "learning_rate": 0.0004310820700997381, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.7952256202697754, + "step": 14720 + }, + { + "ce_loss": 0.1933000683784485, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.2214220017194748, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.155678853392601, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.7052319049835205, + "step": 14720 + }, + { + "ce_loss": 0.21559998393058777, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.2121732532978058, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.17546199262142181, + "step": 14720 + }, + { + "epoch": 4.913275517011341, + "loss": 0.7112, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "grad_norm": 2.589341878890991, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "learning_rate": 0.0004306512276057746, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.6831985712051392, + "step": 14730 + }, + { + "ce_loss": 0.21680234372615814, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.17091289162635803, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.24290505051612854, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.820207953453064, + "step": 14730 + }, + { + "ce_loss": 0.25783631205558777, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.21073512732982635, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.19787658751010895, + "step": 14730 + }, + { + "epoch": 4.916611074049366, + "loss": 0.6716, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "grad_norm": 1.1120072603225708, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "learning_rate": 0.0004302203493385306, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.569556474685669, + "step": 14740 + }, + { + "ce_loss": 0.16752736270427704, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.18688969314098358, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.21498115360736847, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.5875118970870972, + "step": 14740 + }, + { + "ce_loss": 0.24991795420646667, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.15244053304195404, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.18492494523525238, + "step": 14740 + }, + { + "epoch": 4.919946631087392, + "loss": 0.6869, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "grad_norm": 2.2955188751220703, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "learning_rate": 0.00042978943580088683, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.6526211500167847, + "step": 14750 + }, + { + "ce_loss": 0.25695958733558655, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.2014576494693756, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.1937873214483261, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.6520219445228577, + "step": 14750 + }, + { + "ce_loss": 0.18020305037498474, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.19618704915046692, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.16471585631370544, + "step": 14750 + }, + { + "epoch": 4.923282188125417, + "loss": 0.667, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "grad_norm": 1.4162378311157227, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "learning_rate": 0.00042935848749576605, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.8888968825340271, + "step": 14760 + }, + { + "ce_loss": 0.2973913252353668, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.20532050728797913, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.19992171227931976, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.5608199834823608, + "step": 14760 + }, + { + "ce_loss": 0.21077339351177216, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.16744637489318848, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.14417974650859833, + "step": 14760 + }, + { + "epoch": 4.926617745163442, + "loss": 0.7523, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "grad_norm": 1.8813337087631226, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "learning_rate": 0.00042892750492613124, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.8061093091964722, + "step": 14770 + }, + { + "ce_loss": 0.21497511863708496, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.3684411942958832, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.17852698266506195, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.8953151702880859, + "step": 14770 + }, + { + "ce_loss": 0.27103355526924133, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.44409942626953125, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.17995832860469818, + "step": 14770 + }, + { + "epoch": 4.929953302201468, + "loss": 0.8933, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "grad_norm": 1.758418083190918, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "learning_rate": 0.00042849648859498554, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.9538238048553467, + "step": 14780 + }, + { + "ce_loss": 0.2814600169658661, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.46908703446388245, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.15025387704372406, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.6634703278541565, + "step": 14780 + }, + { + "ce_loss": 0.17408856749534607, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.3205197751522064, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.13817840814590454, + "step": 14780 + }, + { + "epoch": 4.933288859239493, + "loss": 0.8628, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "grad_norm": 2.2300400733947754, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "learning_rate": 0.0004280654390053712, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 1.016411304473877, + "step": 14790 + }, + { + "ce_loss": 0.2313816100358963, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.4427144229412079, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.20980128645896912, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 0.8823920488357544, + "step": 14790 + }, + { + "ce_loss": 0.24262329936027527, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.4533618986606598, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.18566647171974182, + "step": 14790 + }, + { + "epoch": 4.936624416277518, + "loss": 1.0459, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "grad_norm": 6.204045295715332, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "learning_rate": 0.00042763435666036973, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 1.3538483381271362, + "step": 14800 + }, + { + "ce_loss": 0.3704322576522827, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.7152073979377747, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.2093076854944229, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 0.9754691123962402, + "step": 14800 + }, + { + "ce_loss": 0.24051721394062042, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.5091836452484131, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.17049235105514526, + "step": 14800 + }, + { + "epoch": 4.939959973315544, + "loss": 1.0958, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "grad_norm": 3.4569523334503174, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "learning_rate": 0.0004272032420631003, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 0.8569350242614746, + "step": 14810 + }, + { + "ce_loss": 0.25664758682250977, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.39706772565841675, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.16805876791477203, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 1.6937330961227417, + "step": 14810 + }, + { + "ce_loss": 0.2796967625617981, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.9483956694602966, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.2175309658050537, + "step": 14810 + }, + { + "epoch": 4.943295530353569, + "loss": 1.0918, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "grad_norm": 2.689622163772583, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "learning_rate": 0.0004267720957167202, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.7044408917427063, + "step": 14820 + }, + { + "ce_loss": 0.17607758939266205, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.3482334017753601, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.17931784689426422, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.9181632399559021, + "step": 14820 + }, + { + "ce_loss": 0.260895699262619, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.31426650285720825, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.19637629389762878, + "step": 14820 + }, + { + "epoch": 4.946631087391594, + "loss": 0.8052, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "grad_norm": 1.5513012409210205, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "learning_rate": 0.0004263409181244236, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 0.7607131004333496, + "step": 14830 + }, + { + "ce_loss": 0.25896647572517395, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.34560784697532654, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.15555359423160553, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 1.0125609636306763, + "step": 14830 + }, + { + "ce_loss": 0.225737527012825, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.38140517473220825, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.19740155339241028, + "step": 14830 + }, + { + "epoch": 4.94996664442962, + "loss": 0.7785, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "grad_norm": 1.6204591989517212, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "learning_rate": 0.00042590970978944134, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 0.651091992855072, + "step": 14840 + }, + { + "ce_loss": 0.17360687255859375, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.30723288655281067, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.1698845624923706, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 1.0454457998275757, + "step": 14840 + }, + { + "ce_loss": 0.2976030111312866, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.36757364869117737, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.25793442130088806, + "step": 14840 + }, + { + "epoch": 4.953302201467645, + "loss": 0.8274, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "grad_norm": 1.1396735906600952, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "learning_rate": 0.00042547847121503956, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.7016441226005554, + "step": 14850 + }, + { + "ce_loss": 0.197813481092453, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.3064996600151062, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.14802461862564087, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.8446516990661621, + "step": 14850 + }, + { + "ce_loss": 0.19737334549427032, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.23980382084846497, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.1891128122806549, + "step": 14850 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.8137, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "grad_norm": 1.8514357805252075, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "learning_rate": 0.0004250472029045204, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.7438077330589294, + "step": 14860 + }, + { + "ce_loss": 0.2085508406162262, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.3794730305671692, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.15562745928764343, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.595439076423645, + "step": 14860 + }, + { + "ce_loss": 0.16566596925258636, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.3100895583629608, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.11954033374786377, + "step": 14860 + }, + { + "epoch": 4.959973315543696, + "loss": 0.7921, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "grad_norm": 2.2713711261749268, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "learning_rate": 0.00042461590536122017, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.7876851558685303, + "step": 14870 + }, + { + "ce_loss": 0.21698933839797974, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.28425318002700806, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.15588748455047607, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.872055172920227, + "step": 14870 + }, + { + "ce_loss": 0.23964551091194153, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.37298545241355896, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.18387804925441742, + "step": 14870 + }, + { + "epoch": 4.963308872581721, + "loss": 0.807, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "grad_norm": 1.6793760061264038, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "learning_rate": 0.0004241845790885096, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 1.0691640377044678, + "step": 14880 + }, + { + "ce_loss": 0.25611549615859985, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.361844539642334, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.17869262397289276, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 1.216537356376648, + "step": 14880 + }, + { + "ce_loss": 0.247919499874115, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.3699841797351837, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.1814611852169037, + "step": 14880 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.8803, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "grad_norm": 1.9350767135620117, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "learning_rate": 0.00042375322458979286, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 1.2860150337219238, + "step": 14890 + }, + { + "ce_loss": 0.18326349556446075, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.3313220739364624, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.1453424096107483, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 1.0579900741577148, + "step": 14890 + }, + { + "ce_loss": 0.23096811771392822, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.4033205211162567, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.1717856526374817, + "step": 14890 + }, + { + "epoch": 4.969979986657772, + "loss": 0.8604, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "grad_norm": 1.4258919954299927, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "learning_rate": 0.00042332184236850714, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.6514805555343628, + "step": 14900 + }, + { + "ce_loss": 0.17925575375556946, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.2957485020160675, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.1762385070323944, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.9138930439949036, + "step": 14900 + }, + { + "ce_loss": 0.2070235162973404, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.3828832507133484, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.16424107551574707, + "step": 14900 + }, + { + "epoch": 4.973315543695797, + "loss": 0.7769, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "grad_norm": 2.2698750495910645, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "learning_rate": 0.00042289043292812183, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.8078776001930237, + "step": 14910 + }, + { + "ce_loss": 0.19389958679676056, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.4422124922275543, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.17119190096855164, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.7781140208244324, + "step": 14910 + }, + { + "ce_loss": 0.207720085978508, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.3681999444961548, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.2020249366760254, + "step": 14910 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.7858, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "grad_norm": 1.6022636890411377, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "learning_rate": 0.00042245899677213804, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.7355327010154724, + "step": 14920 + }, + { + "ce_loss": 0.24947616457939148, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.28465527296066284, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.15694041550159454, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.961696445941925, + "step": 14920 + }, + { + "ce_loss": 0.22658829391002655, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.23476535081863403, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.22592884302139282, + "step": 14920 + }, + { + "epoch": 4.979986657771848, + "loss": 0.7838, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "grad_norm": 1.8493866920471191, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "learning_rate": 0.0004220275344040885, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.6180217862129211, + "step": 14930 + }, + { + "ce_loss": 0.23227348923683167, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.2516789138317108, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.13387711346149445, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.7691706418991089, + "step": 14930 + }, + { + "ce_loss": 0.08289957791566849, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.18975229561328888, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.10202156007289886, + "step": 14930 + }, + { + "epoch": 4.983322214809873, + "loss": 0.8085, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "grad_norm": 2.2217841148376465, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "learning_rate": 0.00042159604632753593, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.8267608284950256, + "step": 14940 + }, + { + "ce_loss": 0.2368335723876953, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.28960832953453064, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.18242600560188293, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.7490988969802856, + "step": 14940 + }, + { + "ce_loss": 0.1887538582086563, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.2502892315387726, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.14834783971309662, + "step": 14940 + }, + { + "epoch": 4.986657771847899, + "loss": 0.8345, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "grad_norm": 3.3127646446228027, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "learning_rate": 0.0004211645330460736, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.7695230841636658, + "step": 14950 + }, + { + "ce_loss": 0.1897069364786148, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.32701951265335083, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.17356127500534058, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.8446176052093506, + "step": 14950 + }, + { + "ce_loss": 0.18127408623695374, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.3553479313850403, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.17855243384838104, + "step": 14950 + }, + { + "epoch": 4.989993328885924, + "loss": 0.8408, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "grad_norm": 1.8860888481140137, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "learning_rate": 0.0004207329950633237, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.6589542627334595, + "step": 14960 + }, + { + "ce_loss": 0.18732410669326782, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.27363622188568115, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.1468590348958969, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.603113055229187, + "step": 14960 + }, + { + "ce_loss": 0.1368173062801361, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.31260940432548523, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.11893227696418762, + "step": 14960 + }, + { + "epoch": 4.993328885923949, + "loss": 0.8121, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "grad_norm": 1.615979790687561, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "learning_rate": 0.0004203014328829377, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.8362368941307068, + "step": 14970 + }, + { + "ce_loss": 0.28994032740592957, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.3475920557975769, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.19849959015846252, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.8362758159637451, + "step": 14970 + }, + { + "ce_loss": 0.23408402502536774, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.3129236102104187, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.17037709057331085, + "step": 14970 + }, + { + "epoch": 4.996664442961975, + "loss": 0.7861, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "grad_norm": 9.854972839355469, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "learning_rate": 0.00041986984700859473, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.7983810901641846, + "step": 14980 + }, + { + "ce_loss": 0.22925935685634613, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.2890111804008484, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.17651940882205963, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.7387625575065613, + "step": 14980 + }, + { + "ce_loss": 0.20382659137248993, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.3147651255130768, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.15080402791500092, + "step": 14980 + }, + { + "epoch": 5.0, + "loss": 0.8124, + "step": 14990 + }, + { + "epoch": 5.0, + "grad_norm": 1.7010997533798218, + "step": 14990 + }, + { + "epoch": 5.0, + "learning_rate": 0.00041943823794400256, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.5493976473808289, + "step": 14990 + }, + { + "ce_loss": 0.12174477428197861, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.29600775241851807, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.13114362955093384, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.6701236963272095, + "step": 14990 + }, + { + "ce_loss": 0.1493186503648758, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.29544782638549805, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.125166118144989, + "step": 14990 + }, + { + "epoch": 5.003335557038025, + "loss": 0.6878, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "grad_norm": 1.6340457201004028, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "learning_rate": 0.0004190066061928949, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.7245055437088013, + "step": 15000 + }, + { + "ce_loss": 0.32626980543136597, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.15379448235034943, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.19659432768821716, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.5948415994644165, + "step": 15000 + }, + { + "ce_loss": 0.279764324426651, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.1267262101173401, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.187013179063797, + "step": 15000 + }, + { + "epoch": 5.006671114076051, + "loss": 0.7193, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "grad_norm": 2.0343847274780273, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "learning_rate": 0.0004185749522590327, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.5516359210014343, + "step": 15010 + }, + { + "ce_loss": 0.1559305191040039, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.2148263156414032, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.18032346665859222, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.6730528473854065, + "step": 15010 + }, + { + "ce_loss": 0.256583571434021, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.23742398619651794, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.15251505374908447, + "step": 15010 + }, + { + "epoch": 5.010006671114076, + "loss": 0.7347, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "grad_norm": 2.410008430480957, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "learning_rate": 0.00041814327664620236, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.9244962930679321, + "step": 15020 + }, + { + "ce_loss": 0.12703068554401398, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.3417317271232605, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.16752563416957855, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.6968506574630737, + "step": 15020 + }, + { + "ce_loss": 0.17538586258888245, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.3248499035835266, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.15323364734649658, + "step": 15020 + }, + { + "epoch": 5.013342228152101, + "loss": 0.8062, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "grad_norm": 1.9668642282485962, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "learning_rate": 0.00041771157985821583, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.7335589528083801, + "step": 15030 + }, + { + "ce_loss": 0.185219407081604, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.35626813769340515, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.19170069694519043, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.8919950127601624, + "step": 15030 + }, + { + "ce_loss": 0.17872750759124756, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.3545999825000763, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.1883302479982376, + "step": 15030 + }, + { + "epoch": 5.016677785190127, + "loss": 0.7265, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "grad_norm": 1.4379287958145142, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "learning_rate": 0.0004172798623989099, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.7448151707649231, + "step": 15040 + }, + { + "ce_loss": 0.20049512386322021, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.34122711420059204, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.12881450355052948, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.7172622680664062, + "step": 15040 + }, + { + "ce_loss": 0.12092036753892899, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.21639123558998108, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.10656973719596863, + "step": 15040 + }, + { + "epoch": 5.020013342228152, + "loss": 0.7088, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "grad_norm": 1.9609133005142212, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "learning_rate": 0.00041684812477214513, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.6304663419723511, + "step": 15050 + }, + { + "ce_loss": 0.1168755292892456, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.2414208948612213, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.11683718115091324, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.8791097402572632, + "step": 15050 + }, + { + "ce_loss": 0.14968617260456085, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.299162894487381, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.12304960936307907, + "step": 15050 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.7655, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "grad_norm": 2.369798183441162, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "learning_rate": 0.0004164163674818058, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.7249473929405212, + "step": 15060 + }, + { + "ce_loss": 0.19053047895431519, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.3092261254787445, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.16751517355442047, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.7884148359298706, + "step": 15060 + }, + { + "ce_loss": 0.11927471309900284, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.30460503697395325, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.1316409558057785, + "step": 15060 + }, + { + "epoch": 5.026684456304203, + "loss": 0.757, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "grad_norm": 1.569125771522522, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "learning_rate": 0.00041598459103179923, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.7768588066101074, + "step": 15070 + }, + { + "ce_loss": 0.17738273739814758, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.33382323384284973, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.1485685408115387, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.7797510623931885, + "step": 15070 + }, + { + "ce_loss": 0.20250549912452698, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.2760103940963745, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.17828702926635742, + "step": 15070 + }, + { + "epoch": 5.030020013342228, + "loss": 0.8597, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "grad_norm": 1.8850408792495728, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "learning_rate": 0.0004155527959260548, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.9354071021080017, + "step": 15080 + }, + { + "ce_loss": 0.2691536247730255, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.4211371839046478, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.1899864375591278, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.5601158142089844, + "step": 15080 + }, + { + "ce_loss": 0.14419464766979218, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.2756442129611969, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.1400666981935501, + "step": 15080 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.7297, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "grad_norm": 1.7295362949371338, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "learning_rate": 0.0004151209826685239, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.7522996068000793, + "step": 15090 + }, + { + "ce_loss": 0.21195000410079956, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.333752304315567, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.16549304127693176, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.6685563921928406, + "step": 15090 + }, + { + "ce_loss": 0.14069870114326477, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.3408507704734802, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.14039243757724762, + "step": 15090 + }, + { + "epoch": 5.036691127418279, + "loss": 0.7199, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "grad_norm": 2.117892026901245, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "learning_rate": 0.00041468915176317927, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.5341494083404541, + "step": 15100 + }, + { + "ce_loss": 0.1536131352186203, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.2793445289134979, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.1009863018989563, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.7302848100662231, + "step": 15100 + }, + { + "ce_loss": 0.1963079273700714, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.33204758167266846, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.15897220373153687, + "step": 15100 + }, + { + "epoch": 5.040026684456304, + "loss": 0.7521, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "grad_norm": 2.443079710006714, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "learning_rate": 0.00041425730371401397, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 0.718482494354248, + "step": 15110 + }, + { + "ce_loss": 0.20535221695899963, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.3341054320335388, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.1788957566022873, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 1.0382378101348877, + "step": 15110 + }, + { + "ce_loss": 0.2585584819316864, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.39312127232551575, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.17663244903087616, + "step": 15110 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.8617, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "grad_norm": 2.586002826690674, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "learning_rate": 0.000413825439025041, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.9322474002838135, + "step": 15120 + }, + { + "ce_loss": 0.17946970462799072, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.38679298758506775, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.12891161441802979, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.7245580554008484, + "step": 15120 + }, + { + "ce_loss": 0.18467949330806732, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.3750002384185791, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.14669382572174072, + "step": 15120 + }, + { + "epoch": 5.046697798532355, + "loss": 0.7683, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "grad_norm": 2.1405930519104004, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "learning_rate": 0.0004133935582002931, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.7090080380439758, + "step": 15130 + }, + { + "ce_loss": 0.18841010332107544, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.3492705523967743, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.1372767835855484, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.7359943985939026, + "step": 15130 + }, + { + "ce_loss": 0.19403263926506042, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.3826109766960144, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.1589689403772354, + "step": 15130 + }, + { + "epoch": 5.05003335557038, + "loss": 0.8044, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "grad_norm": 1.4716843366622925, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "learning_rate": 0.0004129616617438214, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 1.0415242910385132, + "step": 15140 + }, + { + "ce_loss": 0.19426783919334412, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.3080959618091583, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.16131074726581573, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 0.5969170928001404, + "step": 15140 + }, + { + "ce_loss": 0.1542249321937561, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.23913876712322235, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.1418016105890274, + "step": 15140 + }, + { + "epoch": 5.053368912608406, + "loss": 0.7259, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "grad_norm": 1.4115509986877441, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "learning_rate": 0.0004125297501596958, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.8852753043174744, + "step": 15150 + }, + { + "ce_loss": 0.2032100260257721, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.3799605071544647, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.15284423530101776, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.7631134390830994, + "step": 15150 + }, + { + "ce_loss": 0.23339112102985382, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.3507590889930725, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.17875313758850098, + "step": 15150 + }, + { + "epoch": 5.056704469646431, + "loss": 0.7739, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "grad_norm": 1.3898341655731201, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "learning_rate": 0.0004120978239520035, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.7077916264533997, + "step": 15160 + }, + { + "ce_loss": 0.1837417483329773, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.30820026993751526, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.1538972705602646, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.70729660987854, + "step": 15160 + }, + { + "ce_loss": 0.14605118334293365, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.28691962361335754, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.13982750475406647, + "step": 15160 + }, + { + "epoch": 5.060040026684456, + "loss": 0.752, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "grad_norm": 1.3309475183486938, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "learning_rate": 0.0004116658836248489, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.6324098706245422, + "step": 15170 + }, + { + "ce_loss": 0.16235965490341187, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.3043210506439209, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.13393297791481018, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.8985719680786133, + "step": 15170 + }, + { + "ce_loss": 0.22842848300933838, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.37544792890548706, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.20678819715976715, + "step": 15170 + }, + { + "epoch": 5.063375583722482, + "loss": 0.7813, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "grad_norm": 1.6714725494384766, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "learning_rate": 0.00041123392968235275, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.8342439532279968, + "step": 15180 + }, + { + "ce_loss": 0.12360727041959763, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.2871699333190918, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.13866832852363586, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.8737316727638245, + "step": 15180 + }, + { + "ce_loss": 0.2810107171535492, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.3678017854690552, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.1715250015258789, + "step": 15180 + }, + { + "epoch": 5.066711140760507, + "loss": 0.7547, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "grad_norm": 1.899660348892212, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "learning_rate": 0.00041080196262865195, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.8878491520881653, + "step": 15190 + }, + { + "ce_loss": 0.20929262042045593, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.3679594099521637, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.2262931913137436, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.5590569972991943, + "step": 15190 + }, + { + "ce_loss": 0.14374060928821564, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.28797242045402527, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.1269454061985016, + "step": 15190 + }, + { + "epoch": 5.070046697798532, + "loss": 0.7276, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "grad_norm": 1.7899247407913208, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "learning_rate": 0.0004103699829678983, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.8208807706832886, + "step": 15200 + }, + { + "ce_loss": 0.14580045640468597, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.3477942943572998, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.14927612245082855, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.6615450978279114, + "step": 15200 + }, + { + "ce_loss": 0.18760168552398682, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.3171888589859009, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.11915821582078934, + "step": 15200 + }, + { + "epoch": 5.073382254836558, + "loss": 0.8015, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "grad_norm": 1.3192826509475708, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "learning_rate": 0.00040993799120425873, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.7259508967399597, + "step": 15210 + }, + { + "ce_loss": 0.17098185420036316, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.3219786286354065, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.14518868923187256, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.9862769842147827, + "step": 15210 + }, + { + "ce_loss": 0.14905671775341034, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.39070963859558105, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.13869890570640564, + "step": 15210 + }, + { + "epoch": 5.076717811874583, + "loss": 0.7702, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "grad_norm": 2.40421724319458, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "learning_rate": 0.000409505987841914, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.796295166015625, + "step": 15220 + }, + { + "ce_loss": 0.1610361784696579, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.3039185106754303, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.20260609686374664, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.9617740511894226, + "step": 15220 + }, + { + "ce_loss": 0.22738519310951233, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.3633210361003876, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.16509735584259033, + "step": 15220 + }, + { + "epoch": 5.080053368912608, + "loss": 0.7737, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "grad_norm": 2.2806813716888428, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "learning_rate": 0.0004090739733850587, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.6277940273284912, + "step": 15230 + }, + { + "ce_loss": 0.1376854032278061, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.34044697880744934, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.11363232880830765, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.6917346119880676, + "step": 15230 + }, + { + "ce_loss": 0.20212016999721527, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.31108105182647705, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.16628040373325348, + "step": 15230 + }, + { + "epoch": 5.083388925950634, + "loss": 0.7025, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "grad_norm": 2.196148157119751, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "learning_rate": 0.00040864194833789997, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 1.0141888856887817, + "step": 15240 + }, + { + "ce_loss": 0.16802021861076355, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.32726022601127625, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.16862088441848755, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 0.8400261998176575, + "step": 15240 + }, + { + "ce_loss": 0.15872463583946228, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.31594353914260864, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.15056195855140686, + "step": 15240 + }, + { + "epoch": 5.086724482988659, + "loss": 0.7916, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "grad_norm": 1.3587056398391724, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "learning_rate": 0.0004082099132046575, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.6186241507530212, + "step": 15250 + }, + { + "ce_loss": 0.1958022266626358, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.2964455485343933, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.12577207386493683, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.7843382954597473, + "step": 15250 + }, + { + "ce_loss": 0.1981726884841919, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.3239561915397644, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.14233283698558807, + "step": 15250 + }, + { + "epoch": 5.090060040026684, + "loss": 0.6869, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "grad_norm": 1.2597806453704834, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "learning_rate": 0.00040777786848956304, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.6707214117050171, + "step": 15260 + }, + { + "ce_loss": 0.16234301030635834, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.27243322134017944, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.18407489359378815, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.7344317436218262, + "step": 15260 + }, + { + "ce_loss": 0.16686370968818665, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.31479838490486145, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.14425840973854065, + "step": 15260 + }, + { + "epoch": 5.09339559706471, + "loss": 0.7113, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "grad_norm": 2.707850694656372, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "learning_rate": 0.00040734581469685906, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.787645697593689, + "step": 15270 + }, + { + "ce_loss": 0.18194088339805603, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.3009050786495209, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.14698223769664764, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.8195387721061707, + "step": 15270 + }, + { + "ce_loss": 0.17880409955978394, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.2749001979827881, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.14499787986278534, + "step": 15270 + }, + { + "epoch": 5.096731154102735, + "loss": 0.7105, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "grad_norm": 1.1744245290756226, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "learning_rate": 0.00040691375233079907, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.8375328183174133, + "step": 15280 + }, + { + "ce_loss": 0.21677394211292267, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.2636817991733551, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.17519797384738922, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.4660475254058838, + "step": 15280 + }, + { + "ce_loss": 0.14812767505645752, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.19933080673217773, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.11839006096124649, + "step": 15280 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.6638, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "grad_norm": 2.339646339416504, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "learning_rate": 0.00040648168189564595, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.7594683170318604, + "step": 15290 + }, + { + "ce_loss": 0.22392548620700836, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.3010225296020508, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.15817376971244812, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.6978979706764221, + "step": 15290 + }, + { + "ce_loss": 0.24014516174793243, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.2784227132797241, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.17909127473831177, + "step": 15290 + }, + { + "epoch": 5.103402268178786, + "loss": 0.7545, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "grad_norm": 1.8751845359802246, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "learning_rate": 0.00040604960389567274, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.528994619846344, + "step": 15300 + }, + { + "ce_loss": 0.1152702048420906, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.2503540515899658, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.12252363562583923, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.8748533129692078, + "step": 15300 + }, + { + "ce_loss": 0.20872852206230164, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.43009328842163086, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.16917204856872559, + "step": 15300 + }, + { + "epoch": 5.106737825216811, + "loss": 0.679, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "grad_norm": 1.5614938735961914, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "learning_rate": 0.00040561751883516064, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.7292320728302002, + "step": 15310 + }, + { + "ce_loss": 0.1953887790441513, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.35018715262413025, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.13770712912082672, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.7030383348464966, + "step": 15310 + }, + { + "ce_loss": 0.15858271718025208, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.2966959476470947, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.1796363741159439, + "step": 15310 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.7627, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "grad_norm": 1.8307886123657227, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "learning_rate": 0.00040518542721839967, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 1.134617567062378, + "step": 15320 + }, + { + "ce_loss": 0.27231743931770325, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.44714435935020447, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.19462838768959045, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.6627607345581055, + "step": 15320 + }, + { + "ce_loss": 0.15691936016082764, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.2951563000679016, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.12084396928548813, + "step": 15320 + }, + { + "epoch": 5.113408939292862, + "loss": 0.8722, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "grad_norm": 2.4782094955444336, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "learning_rate": 0.00040475332954968723, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.8449116945266724, + "step": 15330 + }, + { + "ce_loss": 0.2228572964668274, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.3963652551174164, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.16229957342147827, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.5537840723991394, + "step": 15330 + }, + { + "ce_loss": 0.10099782794713974, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.278985857963562, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.12842731177806854, + "step": 15330 + }, + { + "epoch": 5.116744496330887, + "loss": 0.8228, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "grad_norm": 2.2996766567230225, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "learning_rate": 0.0004043212263333277, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.8240509033203125, + "step": 15340 + }, + { + "ce_loss": 0.24200014770030975, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.36865487694740295, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.1704481840133667, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.5815379023551941, + "step": 15340 + }, + { + "ce_loss": 0.13422711193561554, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.3356119394302368, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.11077725142240524, + "step": 15340 + }, + { + "epoch": 5.120080053368913, + "loss": 0.7924, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "grad_norm": 1.4737706184387207, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "learning_rate": 0.000403889118073632, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.7064817547798157, + "step": 15350 + }, + { + "ce_loss": 0.17357517778873444, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.26995182037353516, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.19605916738510132, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.4547923505306244, + "step": 15350 + }, + { + "ce_loss": 0.13223117589950562, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.19939839839935303, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.12301374971866608, + "step": 15350 + }, + { + "epoch": 5.123415610406938, + "loss": 0.6784, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "grad_norm": 1.4126379489898682, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "learning_rate": 0.00040345700527491703, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.7251626253128052, + "step": 15360 + }, + { + "ce_loss": 0.20589450001716614, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.3297916650772095, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.13303814828395844, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.5926677584648132, + "step": 15360 + }, + { + "ce_loss": 0.14590634405612946, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.2872351408004761, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.12017708271741867, + "step": 15360 + }, + { + "epoch": 5.126751167444963, + "loss": 0.6853, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "grad_norm": 1.6726282835006714, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "learning_rate": 0.0004030248884415049, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.6812902688980103, + "step": 15370 + }, + { + "ce_loss": 0.14721952378749847, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.27806055545806885, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.11692876368761063, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.7496767640113831, + "step": 15370 + }, + { + "ce_loss": 0.18391594290733337, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.3393586277961731, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.14704890549182892, + "step": 15370 + }, + { + "epoch": 5.130086724482989, + "loss": 0.7202, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "grad_norm": 1.6118098497390747, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "learning_rate": 0.00040259276807772264, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.6544176936149597, + "step": 15380 + }, + { + "ce_loss": 0.1918047070503235, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.27302777767181396, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.15166480839252472, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.6611236929893494, + "step": 15380 + }, + { + "ce_loss": 0.2007049024105072, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.2729007303714752, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.16753646731376648, + "step": 15380 + }, + { + "epoch": 5.133422281521014, + "loss": 0.7482, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "grad_norm": 2.3332631587982178, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "learning_rate": 0.0004021606446879008, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.7054694890975952, + "step": 15390 + }, + { + "ce_loss": 0.1783827692270279, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.3041553497314453, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.1363864243030548, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.6848443150520325, + "step": 15390 + }, + { + "ce_loss": 0.15475906431674957, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.3089504837989807, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.12166281044483185, + "step": 15390 + }, + { + "epoch": 5.136757838559039, + "loss": 0.7116, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "grad_norm": 1.8791468143463135, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "learning_rate": 0.00040172851877637425, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 0.8204621076583862, + "step": 15400 + }, + { + "ce_loss": 0.18681691586971283, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.31471189856529236, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.14055274426937103, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 1.2355568408966064, + "step": 15400 + }, + { + "ce_loss": 0.22752253711223602, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.30248481035232544, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.1581341177225113, + "step": 15400 + }, + { + "epoch": 5.140093395597065, + "loss": 0.763, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "grad_norm": 3.6763594150543213, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "learning_rate": 0.00040129639084748034, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.7683576345443726, + "step": 15410 + }, + { + "ce_loss": 0.19131425023078918, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.30612730979919434, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.21812304854393005, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.546682596206665, + "step": 15410 + }, + { + "ce_loss": 0.15684723854064941, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.22708380222320557, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.12680214643478394, + "step": 15410 + }, + { + "epoch": 5.14342895263509, + "loss": 0.6825, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "grad_norm": 1.8578838109970093, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "learning_rate": 0.0004008642614055586, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.5246071815490723, + "step": 15420 + }, + { + "ce_loss": 0.13065984845161438, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.2596120834350586, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.09721403568983078, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.9444109797477722, + "step": 15420 + }, + { + "ce_loss": 0.20307934284210205, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.3659123480319977, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.17617152631282806, + "step": 15420 + }, + { + "epoch": 5.146764509673115, + "loss": 0.7336, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "grad_norm": 1.9291861057281494, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "learning_rate": 0.0004004321309549511, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.7223341464996338, + "step": 15430 + }, + { + "ce_loss": 0.15882186591625214, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.3048314154148102, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.12695194780826569, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.6615356206893921, + "step": 15430 + }, + { + "ce_loss": 0.15503180027008057, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.3699668049812317, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.1364234983921051, + "step": 15430 + }, + { + "epoch": 5.150100066711141, + "loss": 0.7307, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "grad_norm": 1.8029311895370483, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "learning_rate": 0.0004, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 1.0619946718215942, + "step": 15440 + }, + { + "ce_loss": 0.27869516611099243, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.39487215876579285, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.20835605263710022, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 0.7138598561286926, + "step": 15440 + }, + { + "ce_loss": 0.21639475226402283, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.32156020402908325, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.1755463182926178, + "step": 15440 + }, + { + "epoch": 5.153435623749166, + "loss": 0.7177, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "grad_norm": 1.7701245546340942, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "learning_rate": 0.000399567869045049, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.6437214016914368, + "step": 15450 + }, + { + "ce_loss": 0.16194361448287964, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.283201664686203, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.15246915817260742, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.7803009748458862, + "step": 15450 + }, + { + "ce_loss": 0.14901769161224365, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.2672480344772339, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.16708533465862274, + "step": 15450 + }, + { + "epoch": 5.156771180787191, + "loss": 0.725, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "grad_norm": 1.2688924074172974, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "learning_rate": 0.0003991357385944414, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.581249475479126, + "step": 15460 + }, + { + "ce_loss": 0.18468716740608215, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.25455421209335327, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.14176595211029053, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.5864779949188232, + "step": 15460 + }, + { + "ce_loss": 0.16304655373096466, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.2759646773338318, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.12296140938997269, + "step": 15460 + }, + { + "epoch": 5.160106737825217, + "loss": 0.7828, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "grad_norm": 1.317644715309143, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "learning_rate": 0.0003987036091525198, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.6303431987762451, + "step": 15470 + }, + { + "ce_loss": 0.16451367735862732, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.2889218032360077, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.13916608691215515, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.7348785400390625, + "step": 15470 + }, + { + "ce_loss": 0.18091663718223572, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.3877863883972168, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.12812453508377075, + "step": 15470 + }, + { + "epoch": 5.163442294863242, + "loss": 0.7009, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "grad_norm": 1.2215092182159424, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "learning_rate": 0.00039827148122362584, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 1.113275170326233, + "step": 15480 + }, + { + "ce_loss": 0.21751204133033752, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.2767632007598877, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.15298031270503998, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 0.6497124433517456, + "step": 15480 + }, + { + "ce_loss": 0.19662845134735107, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.25942057371139526, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.1508607417345047, + "step": 15480 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.6631, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "grad_norm": 1.4898862838745117, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "learning_rate": 0.0003978393553120993, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.6173375248908997, + "step": 15490 + }, + { + "ce_loss": 0.11945261061191559, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.26320600509643555, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.13641858100891113, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.6017664074897766, + "step": 15490 + }, + { + "ce_loss": 0.17880946397781372, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.25818952918052673, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.12212560325860977, + "step": 15490 + }, + { + "epoch": 5.170113408939293, + "loss": 0.7009, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "grad_norm": 1.657698631286621, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "learning_rate": 0.0003974072319222774, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.7776429653167725, + "step": 15500 + }, + { + "ce_loss": 0.21835565567016602, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.3262067437171936, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.1799788922071457, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.6015152335166931, + "step": 15500 + }, + { + "ce_loss": 0.1296067237854004, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.3558272421360016, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.11590032279491425, + "step": 15500 + }, + { + "epoch": 5.173448965977318, + "loss": 0.6985, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "grad_norm": 1.3140312433242798, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "learning_rate": 0.00039697511155849507, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.6865147948265076, + "step": 15510 + }, + { + "ce_loss": 0.2025403380393982, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.2703585624694824, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.1497085988521576, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.5102109909057617, + "step": 15510 + }, + { + "ce_loss": 0.13412393629550934, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.21918663382530212, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.11139141768217087, + "step": 15510 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.6942, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "grad_norm": 2.086806297302246, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "learning_rate": 0.00039654299472508296, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.7411342859268188, + "step": 15520 + }, + { + "ce_loss": 0.18614676594734192, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.3071932792663574, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.13869841396808624, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.6899371147155762, + "step": 15520 + }, + { + "ce_loss": 0.2027830183506012, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.257847398519516, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.18891122937202454, + "step": 15520 + }, + { + "epoch": 5.180120080053369, + "loss": 0.7278, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "grad_norm": 1.283077359199524, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "learning_rate": 0.0003961108819263681, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.6969683170318604, + "step": 15530 + }, + { + "ce_loss": 0.13482505083084106, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.21009461581707, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.13851168751716614, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.47756150364875793, + "step": 15530 + }, + { + "ce_loss": 0.11805874854326248, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.20090797543525696, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.12400776147842407, + "step": 15530 + }, + { + "epoch": 5.183455637091394, + "loss": 0.6822, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "grad_norm": 1.3494940996170044, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "learning_rate": 0.00039567877366667234, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.6591233015060425, + "step": 15540 + }, + { + "ce_loss": 0.25253763794898987, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.26830992102622986, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.1380598396062851, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.6860598921775818, + "step": 15540 + }, + { + "ce_loss": 0.17180071771144867, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.2451307773590088, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.1696072220802307, + "step": 15540 + }, + { + "epoch": 5.18679119412942, + "loss": 0.6521, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "grad_norm": 2.229048252105713, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "learning_rate": 0.00039524667045031287, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.9138002395629883, + "step": 15550 + }, + { + "ce_loss": 0.18019822239875793, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.24058149755001068, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.15312932431697845, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.7437194585800171, + "step": 15550 + }, + { + "ce_loss": 0.17918673157691956, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.2970272898674011, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.12216797471046448, + "step": 15550 + }, + { + "epoch": 5.190126751167445, + "loss": 0.7007, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "grad_norm": 1.5291552543640137, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "learning_rate": 0.00039481457278160037, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 1.0346124172210693, + "step": 15560 + }, + { + "ce_loss": 0.1917654573917389, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.24482488632202148, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.13623914122581482, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 0.7478914260864258, + "step": 15560 + }, + { + "ce_loss": 0.19277100265026093, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.2582072913646698, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.1460961103439331, + "step": 15560 + }, + { + "epoch": 5.19346230820547, + "loss": 0.7929, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "grad_norm": 2.785123109817505, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "learning_rate": 0.00039438248116483945, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.7991613149642944, + "step": 15570 + }, + { + "ce_loss": 0.1651901751756668, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.3231179714202881, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.14283576607704163, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.8519611358642578, + "step": 15570 + }, + { + "ce_loss": 0.19507086277008057, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.34214770793914795, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.1515471190214157, + "step": 15570 + }, + { + "epoch": 5.196797865243496, + "loss": 0.7231, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "grad_norm": 1.721127986907959, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "learning_rate": 0.00039395039610432746, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.7653343081474304, + "step": 15580 + }, + { + "ce_loss": 0.18521682918071747, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.33098432421684265, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.13533954322338104, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.6579205393791199, + "step": 15580 + }, + { + "ce_loss": 0.1290563941001892, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.2866734266281128, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.11047664284706116, + "step": 15580 + }, + { + "epoch": 5.200133422281521, + "loss": 0.7133, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "grad_norm": 1.6893715858459473, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "learning_rate": 0.00039351831810435425, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.8286232948303223, + "step": 15590 + }, + { + "ce_loss": 0.14069385826587677, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.27295631170272827, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.16400879621505737, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.5254113674163818, + "step": 15590 + }, + { + "ce_loss": 0.13113446533679962, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.2617083787918091, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.13194455206394196, + "step": 15590 + }, + { + "epoch": 5.203468979319546, + "loss": 0.887, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "grad_norm": 1.6650769710540771, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "learning_rate": 0.00039308624766920113, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 1.0041122436523438, + "step": 15600 + }, + { + "ce_loss": 0.18588700890541077, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.34661564230918884, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.2050558626651764, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 1.0341342687606812, + "step": 15600 + }, + { + "ce_loss": 0.16048888862133026, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.25734442472457886, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.13042587041854858, + "step": 15600 + }, + { + "epoch": 5.206804536357572, + "loss": 0.783, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "grad_norm": 2.852943181991577, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "learning_rate": 0.00039265418530314087, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 1.0115138292312622, + "step": 15610 + }, + { + "ce_loss": 0.17285360395908356, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.4148566424846649, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.11883706599473953, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 0.6742502450942993, + "step": 15610 + }, + { + "ce_loss": 0.13640636205673218, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.35797151923179626, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.1325061023235321, + "step": 15610 + }, + { + "epoch": 5.210140093395597, + "loss": 0.8527, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "grad_norm": 1.5104297399520874, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "learning_rate": 0.0003922221315104369, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.6583712100982666, + "step": 15620 + }, + { + "ce_loss": 0.1715448945760727, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.34650322794914246, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.140151247382164, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.7607604265213013, + "step": 15620 + }, + { + "ce_loss": 0.18406368792057037, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.3910118043422699, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.1506662219762802, + "step": 15620 + }, + { + "epoch": 5.213475650433622, + "loss": 0.8219, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "grad_norm": 1.69031822681427, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "learning_rate": 0.0003917900867953425, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.8839518427848816, + "step": 15630 + }, + { + "ce_loss": 0.16526243090629578, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.43955349922180176, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.1280617117881775, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.6657829284667969, + "step": 15630 + }, + { + "ce_loss": 0.17464981973171234, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.314512699842453, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.14456845819950104, + "step": 15630 + }, + { + "epoch": 5.216811207471648, + "loss": 0.833, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "grad_norm": 3.2884371280670166, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "learning_rate": 0.00039135805166210007, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.9567729234695435, + "step": 15640 + }, + { + "ce_loss": 0.25114181637763977, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.3986857831478119, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.17955972254276276, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.7126868367195129, + "step": 15640 + }, + { + "ce_loss": 0.15249194204807281, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.31977659463882446, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.13722363114356995, + "step": 15640 + }, + { + "epoch": 5.220146764509673, + "loss": 0.8554, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "grad_norm": 5.034350872039795, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "learning_rate": 0.00039092602661494147, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.6116445064544678, + "step": 15650 + }, + { + "ce_loss": 0.11406902223825455, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.24040089547634125, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.12359070777893066, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.6653336882591248, + "step": 15650 + }, + { + "ce_loss": 0.15816573798656464, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.3218991756439209, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.14883488416671753, + "step": 15650 + }, + { + "epoch": 5.223482321547698, + "loss": 0.7961, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "grad_norm": 1.702118992805481, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "learning_rate": 0.000390494012158086, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.6992454528808594, + "step": 15660 + }, + { + "ce_loss": 0.20349298417568207, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.33723607659339905, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.15842409431934357, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.9260455369949341, + "step": 15660 + }, + { + "ce_loss": 0.21057721972465515, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.3818778693675995, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.15959475934505463, + "step": 15660 + }, + { + "epoch": 5.226817878585724, + "loss": 0.7601, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "grad_norm": 2.036332845687866, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "learning_rate": 0.0003900620087957414, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 0.7959449887275696, + "step": 15670 + }, + { + "ce_loss": 0.16416768729686737, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.3899722099304199, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.17624466121196747, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 1.0749465227127075, + "step": 15670 + }, + { + "ce_loss": 0.2088886797428131, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.36911848187446594, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.16031013429164886, + "step": 15670 + }, + { + "epoch": 5.230153435623749, + "loss": 0.8381, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "grad_norm": 1.862441897392273, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "learning_rate": 0.0003896300170321018, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 1.2608473300933838, + "step": 15680 + }, + { + "ce_loss": 0.18141326308250427, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.3208111524581909, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.16460202634334564, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 0.6557343006134033, + "step": 15680 + }, + { + "ce_loss": 0.1695721298456192, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.2742932438850403, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.14762458205223083, + "step": 15680 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.7043, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "grad_norm": 2.2062363624572754, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "learning_rate": 0.00038919803737134825, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.7139572501182556, + "step": 15690 + }, + { + "ce_loss": 0.17304302752017975, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.35036876797676086, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.1352560669183731, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.6848664879798889, + "step": 15690 + }, + { + "ce_loss": 0.15748827159404755, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.2823999226093292, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.13981734216213226, + "step": 15690 + }, + { + "epoch": 5.2368245496998, + "loss": 0.8185, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "grad_norm": 2.144094705581665, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "learning_rate": 0.00038876607031764735, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.7011567950248718, + "step": 15700 + }, + { + "ce_loss": 0.17664001882076263, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.2781679630279541, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.15840114653110504, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.565709114074707, + "step": 15700 + }, + { + "ce_loss": 0.0962267592549324, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.27149227261543274, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.12075101584196091, + "step": 15700 + }, + { + "epoch": 5.240160106737825, + "loss": 0.7272, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "grad_norm": 1.2410707473754883, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "learning_rate": 0.00038833411637515127, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.5946895480155945, + "step": 15710 + }, + { + "ce_loss": 0.13202613592147827, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.2717001736164093, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.12810038030147552, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.7258737683296204, + "step": 15710 + }, + { + "ce_loss": 0.2018779069185257, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.36134767532348633, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.16241465508937836, + "step": 15710 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.7523, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "grad_norm": 2.900282382965088, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "learning_rate": 0.0003879021760479965, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.7594578266143799, + "step": 15720 + }, + { + "ce_loss": 0.20554454624652863, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.2997867465019226, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.1371021717786789, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.4947853088378906, + "step": 15720 + }, + { + "ce_loss": 0.11712466925382614, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.19641387462615967, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.15747201442718506, + "step": 15720 + }, + { + "epoch": 5.246831220813876, + "loss": 0.7997, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "grad_norm": 4.173399448394775, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "learning_rate": 0.0003874702498403042, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.7463613748550415, + "step": 15730 + }, + { + "ce_loss": 0.18147322535514832, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.3750154674053192, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.14311262965202332, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.7426289916038513, + "step": 15730 + }, + { + "ce_loss": 0.15202565491199493, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.2733365297317505, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.11649046093225479, + "step": 15730 + }, + { + "epoch": 5.250166777851901, + "loss": 0.7452, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "grad_norm": 2.6787667274475098, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "learning_rate": 0.0003870383382561787, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.6864969730377197, + "step": 15740 + }, + { + "ce_loss": 0.15401573479175568, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.32028788328170776, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.16295704245567322, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.5780356526374817, + "step": 15740 + }, + { + "ce_loss": 0.13841503858566284, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.28010672330856323, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.12720590829849243, + "step": 15740 + }, + { + "epoch": 5.253502334889927, + "loss": 0.7336, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "grad_norm": 1.7823841571807861, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "learning_rate": 0.00038660644179970707, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.7867616415023804, + "step": 15750 + }, + { + "ce_loss": 0.1609250158071518, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.2993946969509125, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.11262772977352142, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.8699862957000732, + "step": 15750 + }, + { + "ce_loss": 0.10256128758192062, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.2720155417919159, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.15460257232189178, + "step": 15750 + }, + { + "epoch": 5.256837891927952, + "loss": 0.7103, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "grad_norm": 1.9407238960266113, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "learning_rate": 0.0003861745609749591, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.5640208125114441, + "step": 15760 + }, + { + "ce_loss": 0.1139085665345192, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.26098352670669556, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.13402090966701508, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.6253343224525452, + "step": 15760 + }, + { + "ce_loss": 0.16839808225631714, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.2530108690261841, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.11771053820848465, + "step": 15760 + }, + { + "epoch": 5.260173448965977, + "loss": 0.7787, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "grad_norm": 2.1769955158233643, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "learning_rate": 0.0003857426962859861, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.6088996529579163, + "step": 15770 + }, + { + "ce_loss": 0.12888173758983612, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.27795273065567017, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.13567443192005157, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.6457476615905762, + "step": 15770 + }, + { + "ce_loss": 0.1572996973991394, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.23233917355537415, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.13116711378097534, + "step": 15770 + }, + { + "epoch": 5.263509006004003, + "loss": 0.6841, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "grad_norm": 2.4963042736053467, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "learning_rate": 0.00038531084823682077, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.7489142417907715, + "step": 15780 + }, + { + "ce_loss": 0.21276547014713287, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.32808375358581543, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.14841942489147186, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.6570113301277161, + "step": 15780 + }, + { + "ce_loss": 0.16270892322063446, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.3157443106174469, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.12651695311069489, + "step": 15780 + }, + { + "epoch": 5.266844563042028, + "loss": 0.763, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "grad_norm": 1.597322702407837, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "learning_rate": 0.0003848790173314761, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.6330976486206055, + "step": 15790 + }, + { + "ce_loss": 0.13653190433979034, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.3290137052536011, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.12903012335300446, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.5458581447601318, + "step": 15790 + }, + { + "ce_loss": 0.12407498061656952, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.24929481744766235, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.1366756558418274, + "step": 15790 + }, + { + "epoch": 5.270180120080053, + "loss": 0.7372, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "grad_norm": 1.6201483011245728, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "learning_rate": 0.0003844472040739454, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.7160874009132385, + "step": 15800 + }, + { + "ce_loss": 0.1689663529396057, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.35960906744003296, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.1494126170873642, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.7563287019729614, + "step": 15800 + }, + { + "ce_loss": 0.16878221929073334, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.3290070593357086, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.1567942053079605, + "step": 15800 + }, + { + "epoch": 5.273515677118079, + "loss": 0.7504, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "grad_norm": 1.6044793128967285, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "learning_rate": 0.00038401540896820097, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.7505194544792175, + "step": 15810 + }, + { + "ce_loss": 0.21263115108013153, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.3389429450035095, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.14325197041034698, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.7398129105567932, + "step": 15810 + }, + { + "ce_loss": 0.1300848126411438, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.27438250184059143, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.1393304020166397, + "step": 15810 + }, + { + "epoch": 5.276851234156104, + "loss": 0.7488, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "grad_norm": 2.243036985397339, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "learning_rate": 0.0003835836325181943, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.6281885504722595, + "step": 15820 + }, + { + "ce_loss": 0.18166813254356384, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.24070219695568085, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.14517952501773834, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.5907910466194153, + "step": 15820 + }, + { + "ce_loss": 0.18832477927207947, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.24199432134628296, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.1601739078760147, + "step": 15820 + }, + { + "epoch": 5.280186791194129, + "loss": 0.7628, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "grad_norm": 1.7993576526641846, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "learning_rate": 0.00038315187522785485, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.6452166438102722, + "step": 15830 + }, + { + "ce_loss": 0.18044812977313995, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.3071821331977844, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.11665491759777069, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.6112740635871887, + "step": 15830 + }, + { + "ce_loss": 0.19033896923065186, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.25625908374786377, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.1352468729019165, + "step": 15830 + }, + { + "epoch": 5.283522348232155, + "loss": 0.6916, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "grad_norm": 1.3148763179779053, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "learning_rate": 0.0003827201376010901, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.7609184980392456, + "step": 15840 + }, + { + "ce_loss": 0.15719489753246307, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.24489633738994598, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.1509247124195099, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.644400417804718, + "step": 15840 + }, + { + "ce_loss": 0.18509264290332794, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.2854074537754059, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.13598176836967468, + "step": 15840 + }, + { + "epoch": 5.28685790527018, + "loss": 0.6827, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "grad_norm": 1.4376471042633057, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "learning_rate": 0.0003822884201417841, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.7024563550949097, + "step": 15850 + }, + { + "ce_loss": 0.15480999648571014, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.2750566303730011, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.14786669611930847, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.597403883934021, + "step": 15850 + }, + { + "ce_loss": 0.1416284441947937, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.2873379588127136, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.13200509548187256, + "step": 15850 + }, + { + "epoch": 5.290193462308205, + "loss": 0.7826, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "grad_norm": 3.04026198387146, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "learning_rate": 0.00038185672335379773, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 0.6992098093032837, + "step": 15860 + }, + { + "ce_loss": 0.1623932421207428, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.318131685256958, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.12590597569942474, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 1.0851943492889404, + "step": 15860 + }, + { + "ce_loss": 0.1912342607975006, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.31606537103652954, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.17208561301231384, + "step": 15860 + }, + { + "epoch": 5.293529019346231, + "loss": 0.745, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "grad_norm": 1.4783555269241333, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "learning_rate": 0.0003814250477409674, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.6411396265029907, + "step": 15870 + }, + { + "ce_loss": 0.1674439013004303, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.32471901178359985, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.14873762428760529, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.6937785148620605, + "step": 15870 + }, + { + "ce_loss": 0.15537618100643158, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.29007747769355774, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.1800795942544937, + "step": 15870 + }, + { + "epoch": 5.296864576384256, + "loss": 0.6858, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "grad_norm": 1.375457763671875, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "learning_rate": 0.0003809933938071052, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.9765406847000122, + "step": 15880 + }, + { + "ce_loss": 0.14966456592082977, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.27575427293777466, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.18869327008724213, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.7333070635795593, + "step": 15880 + }, + { + "ce_loss": 0.1790359616279602, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.2985996603965759, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.13419148325920105, + "step": 15880 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.7329, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "grad_norm": 4.050812244415283, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "learning_rate": 0.00038056176205599753, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.7350209951400757, + "step": 15890 + }, + { + "ce_loss": 0.19823463261127472, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.3150331974029541, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.16679008305072784, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.8023890256881714, + "step": 15890 + }, + { + "ce_loss": 0.2002089023590088, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.2447521984577179, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.15051716566085815, + "step": 15890 + }, + { + "epoch": 5.303535690460307, + "loss": 0.7442, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "grad_norm": 2.720048189163208, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "learning_rate": 0.0003801301529914053, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 0.6218310594558716, + "step": 15900 + }, + { + "ce_loss": 0.13920356333255768, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.3183743357658386, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.12536291778087616, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 1.0331048965454102, + "step": 15900 + }, + { + "ce_loss": 0.27767038345336914, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.39653193950653076, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.15157125890254974, + "step": 15900 + }, + { + "epoch": 5.306871247498332, + "loss": 0.7917, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "grad_norm": 1.7117061614990234, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "learning_rate": 0.0003796985671170625, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.6310486793518066, + "step": 15910 + }, + { + "ce_loss": 0.15972702205181122, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.293802946805954, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.1284494400024414, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.6181274652481079, + "step": 15910 + }, + { + "ce_loss": 0.15288305282592773, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.2966892719268799, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.12668883800506592, + "step": 15910 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.7613, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "grad_norm": 2.331742286682129, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "learning_rate": 0.0003792670049366765, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.7452472448348999, + "step": 15920 + }, + { + "ce_loss": 0.21786858141422272, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.28816884756088257, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.16267895698547363, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.684940755367279, + "step": 15920 + }, + { + "ce_loss": 0.14751452207565308, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.34391942620277405, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.1931276023387909, + "step": 15920 + }, + { + "epoch": 5.313542361574383, + "loss": 0.7358, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "grad_norm": 4.6383748054504395, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "learning_rate": 0.0003788354669539266, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.7389318943023682, + "step": 15930 + }, + { + "ce_loss": 0.13353165984153748, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.30215421319007874, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.1452709436416626, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.7610844373703003, + "step": 15930 + }, + { + "ce_loss": 0.18171174824237823, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.2951901853084564, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.1556285321712494, + "step": 15930 + }, + { + "epoch": 5.316877918612408, + "loss": 0.7467, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "grad_norm": 2.451345682144165, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "learning_rate": 0.00037840395367246405, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.7295464873313904, + "step": 15940 + }, + { + "ce_loss": 0.18687045574188232, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.3247690200805664, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.17957422137260437, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.7804718017578125, + "step": 15940 + }, + { + "ce_loss": 0.17364463210105896, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.31363144516944885, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.1749681830406189, + "step": 15940 + }, + { + "epoch": 5.320213475650434, + "loss": 0.6915, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "grad_norm": 2.567410707473755, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "learning_rate": 0.0003779724655959116, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.6483530402183533, + "step": 15950 + }, + { + "ce_loss": 0.20535847544670105, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.3006976544857025, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.14195044338703156, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.6408244371414185, + "step": 15950 + }, + { + "ce_loss": 0.14869768917560577, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.32060331106185913, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.13498196005821228, + "step": 15950 + }, + { + "epoch": 5.323549032688459, + "loss": 0.7747, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "grad_norm": 1.5408507585525513, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "learning_rate": 0.000377541003227862, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.6456042528152466, + "step": 15960 + }, + { + "ce_loss": 0.1565854847431183, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.32306361198425293, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.12703146040439606, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.6512212157249451, + "step": 15960 + }, + { + "ce_loss": 0.16830721497535706, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.356458455324173, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.1261129081249237, + "step": 15960 + }, + { + "epoch": 5.326884589726484, + "loss": 0.7416, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "grad_norm": 2.083885669708252, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "learning_rate": 0.00037710956707187826, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.6469038724899292, + "step": 15970 + }, + { + "ce_loss": 0.14937376976013184, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.3656269609928131, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.09814153611660004, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.734693169593811, + "step": 15970 + }, + { + "ce_loss": 0.20170821249485016, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.37536001205444336, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.1566096842288971, + "step": 15970 + }, + { + "epoch": 5.33022014676451, + "loss": 0.6901, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "grad_norm": 1.5245089530944824, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "learning_rate": 0.00037667815763149296, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.9062093496322632, + "step": 15980 + }, + { + "ce_loss": 0.16774944961071014, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.3081806004047394, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.16400887072086334, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.7526729106903076, + "step": 15980 + }, + { + "ce_loss": 0.157914400100708, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.29823794960975647, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.12280679494142532, + "step": 15980 + }, + { + "epoch": 5.333555703802535, + "loss": 0.7697, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "grad_norm": 2.3326122760772705, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "learning_rate": 0.0003762467754102072, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 1.2512686252593994, + "step": 15990 + }, + { + "ce_loss": 0.19256213307380676, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.33866339921951294, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.1713816374540329, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 0.6252297163009644, + "step": 15990 + }, + { + "ce_loss": 0.14829422533512115, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.33098459243774414, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.11103123426437378, + "step": 15990 + }, + { + "epoch": 5.33689126084056, + "loss": 0.8656, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "grad_norm": 1.9075807332992554, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "learning_rate": 0.00037581542091149055, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.6631274819374084, + "step": 16000 + }, + { + "ce_loss": 0.19678767025470734, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.3303927481174469, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.13508301973342896, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.7302857041358948, + "step": 16000 + }, + { + "ce_loss": 0.2101486772298813, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.3147587478160858, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.1732034981250763, + "step": 16000 + }, + { + "epoch": 5.340226817878586, + "loss": 0.7652, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "grad_norm": 1.8792786598205566, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "learning_rate": 0.00037538409463878, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.9005892872810364, + "step": 16010 + }, + { + "ce_loss": 0.20390790700912476, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.36432161927223206, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.1445426493883133, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.5989366173744202, + "step": 16010 + }, + { + "ce_loss": 0.13817287981510162, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.26578107476234436, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.14596757292747498, + "step": 16010 + }, + { + "epoch": 5.343562374916611, + "loss": 0.7972, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "grad_norm": 1.37563955783844, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "learning_rate": 0.0003749527970954798, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.6624598503112793, + "step": 16020 + }, + { + "ce_loss": 0.14707306027412415, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.36443811655044556, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.1507168412208557, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.7892172932624817, + "step": 16020 + }, + { + "ce_loss": 0.2143014669418335, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.37718236446380615, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.19742228090763092, + "step": 16020 + }, + { + "epoch": 5.346897931954636, + "loss": 0.8115, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "grad_norm": 1.8450546264648438, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "learning_rate": 0.0003745215287849606, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.7200436592102051, + "step": 16030 + }, + { + "ce_loss": 0.15626439452171326, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.2960348129272461, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.11136317998170853, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.6884328126907349, + "step": 16030 + }, + { + "ce_loss": 0.1953802853822708, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.347346693277359, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.14511752128601074, + "step": 16030 + }, + { + "epoch": 5.350233488992662, + "loss": 0.7676, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "grad_norm": 2.2440707683563232, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "learning_rate": 0.00037409029021055886, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.6472846269607544, + "step": 16040 + }, + { + "ce_loss": 0.18246594071388245, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.3376293182373047, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.12702175974845886, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.6976692080497742, + "step": 16040 + }, + { + "ce_loss": 0.21113593876361847, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.31901413202285767, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.16698399186134338, + "step": 16040 + }, + { + "epoch": 5.353569046030687, + "loss": 0.7845, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "grad_norm": 1.2216275930404663, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "learning_rate": 0.00037365908187557634, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.8387020230293274, + "step": 16050 + }, + { + "ce_loss": 0.1646944135427475, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.3545537292957306, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.17081841826438904, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.7031180262565613, + "step": 16050 + }, + { + "ce_loss": 0.17327630519866943, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.2943154275417328, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.19791601598262787, + "step": 16050 + }, + { + "epoch": 5.356904603068712, + "loss": 0.7042, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "grad_norm": 2.8248355388641357, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "learning_rate": 0.0003732279042832798, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.7485116124153137, + "step": 16060 + }, + { + "ce_loss": 0.2005242556333542, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.35190409421920776, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.14127810299396515, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.5735911726951599, + "step": 16060 + }, + { + "ce_loss": 0.1520068496465683, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.2645909786224365, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.13216225802898407, + "step": 16060 + }, + { + "epoch": 5.360240160106738, + "loss": 0.7703, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "grad_norm": 2.925449848175049, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "learning_rate": 0.00037279675793689977, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.8611706495285034, + "step": 16070 + }, + { + "ce_loss": 0.1922069638967514, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.33778977394104004, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.12563969194889069, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.7186930775642395, + "step": 16070 + }, + { + "ce_loss": 0.15724870562553406, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.35159173607826233, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.14582069218158722, + "step": 16070 + }, + { + "epoch": 5.363575717144763, + "loss": 0.7821, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "grad_norm": 3.396252155303955, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "learning_rate": 0.0003723656433396304, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 1.0402714014053345, + "step": 16080 + }, + { + "ce_loss": 0.16723550856113434, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.3183455765247345, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.14463065564632416, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 0.8913321495056152, + "step": 16080 + }, + { + "ce_loss": 0.13748161494731903, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.4310514032840729, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.1252446174621582, + "step": 16080 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.8108, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "grad_norm": 1.7849303483963013, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "learning_rate": 0.0003719345609946289, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.7346190810203552, + "step": 16090 + }, + { + "ce_loss": 0.1527981460094452, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.376956582069397, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.14148907363414764, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.8172603249549866, + "step": 16090 + }, + { + "ce_loss": 0.24916993081569672, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.35236555337905884, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.1716679334640503, + "step": 16090 + }, + { + "epoch": 5.370246831220814, + "loss": 0.7323, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "grad_norm": 2.8443517684936523, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "learning_rate": 0.00037150351140501455, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.581108570098877, + "step": 16100 + }, + { + "ce_loss": 0.14519847929477692, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.2797437012195587, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.12386301159858704, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.700649082660675, + "step": 16100 + }, + { + "ce_loss": 0.17296354472637177, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.3582516312599182, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.1374809294939041, + "step": 16100 + }, + { + "epoch": 5.373582388258839, + "loss": 0.8128, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "grad_norm": 3.3821401596069336, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "learning_rate": 0.00037107249507386885, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 0.8478387594223022, + "step": 16110 + }, + { + "ce_loss": 0.20072631537914276, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.3885403573513031, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.15446209907531738, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 1.0306737422943115, + "step": 16110 + }, + { + "ce_loss": 0.2711739242076874, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.35776910185813904, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.16715602576732635, + "step": 16110 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.7892, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "grad_norm": 1.596034049987793, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "learning_rate": 0.00037064151250423404, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.7503730058670044, + "step": 16120 + }, + { + "ce_loss": 0.18941175937652588, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.33014625310897827, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.14595848321914673, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.5979549288749695, + "step": 16120 + }, + { + "ce_loss": 0.16548322141170502, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.29914453625679016, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.13314181566238403, + "step": 16120 + }, + { + "epoch": 5.38025350233489, + "loss": 0.6698, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "grad_norm": 2.2804458141326904, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "learning_rate": 0.00037021056419911337, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.7559417486190796, + "step": 16130 + }, + { + "ce_loss": 0.17035898566246033, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.32906657457351685, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.14381395280361176, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.8120471835136414, + "step": 16130 + }, + { + "ce_loss": 0.14441224932670593, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.3231331706047058, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.13039423525333405, + "step": 16130 + }, + { + "epoch": 5.383589059372915, + "loss": 0.8278, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "grad_norm": 9.378495216369629, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "learning_rate": 0.0003697796506614696, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.8636850714683533, + "step": 16140 + }, + { + "ce_loss": 0.2732938528060913, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.30688318610191345, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.23132197558879852, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.7541180849075317, + "step": 16140 + }, + { + "ce_loss": 0.1787034571170807, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.2874031364917755, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.16801492869853973, + "step": 16140 + }, + { + "epoch": 5.386924616410941, + "loss": 0.7411, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "grad_norm": 1.6044723987579346, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "learning_rate": 0.0003693487723942255, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.7227868437767029, + "step": 16150 + }, + { + "ce_loss": 0.1402229517698288, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.3689223825931549, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.15327772498130798, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.6728941798210144, + "step": 16150 + }, + { + "ce_loss": 0.14626118540763855, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.2888425886631012, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.14756475389003754, + "step": 16150 + }, + { + "epoch": 5.390260173448966, + "loss": 0.767, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "grad_norm": 1.5285331010818481, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "learning_rate": 0.00036891792990026195, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.636301577091217, + "step": 16160 + }, + { + "ce_loss": 0.11532767117023468, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.30249351263046265, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.1128992810845375, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.8006002306938171, + "step": 16160 + }, + { + "ce_loss": 0.16467390954494476, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.3822260797023773, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.13275885581970215, + "step": 16160 + }, + { + "epoch": 5.393595730486991, + "loss": 0.7877, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "grad_norm": 1.431617021560669, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "learning_rate": 0.00036848712368241904, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.8732019662857056, + "step": 16170 + }, + { + "ce_loss": 0.17849218845367432, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.38769400119781494, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.1828514039516449, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.7162034511566162, + "step": 16170 + }, + { + "ce_loss": 0.19900190830230713, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.3482223153114319, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.16874390840530396, + "step": 16170 + }, + { + "epoch": 5.396931287525017, + "loss": 0.7354, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "grad_norm": 1.560686469078064, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "learning_rate": 0.0003680563542434936, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.8284152150154114, + "step": 16180 + }, + { + "ce_loss": 0.24313443899154663, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.3359900116920471, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.18405799567699432, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.811203122138977, + "step": 16180 + }, + { + "ce_loss": 0.22010092437267303, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.35429173707962036, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.1608809381723404, + "step": 16180 + }, + { + "epoch": 5.400266844563042, + "loss": 0.8111, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "grad_norm": 2.740382432937622, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "learning_rate": 0.00036762562208624016, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.6927562355995178, + "step": 16190 + }, + { + "ce_loss": 0.1379556804895401, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.3650135099887848, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.1422877162694931, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.5087493062019348, + "step": 16190 + }, + { + "ce_loss": 0.14438669383525848, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.2298826277256012, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.11260359734296799, + "step": 16190 + }, + { + "epoch": 5.403602401601067, + "loss": 0.723, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "grad_norm": 1.3632830381393433, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "learning_rate": 0.0003671949277133693, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.5387332439422607, + "step": 16200 + }, + { + "ce_loss": 0.09517402946949005, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.2157907485961914, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.16007165610790253, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.7213792204856873, + "step": 16200 + }, + { + "ce_loss": 0.192056804895401, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.33840423822402954, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.15678493678569794, + "step": 16200 + }, + { + "epoch": 5.406937958639093, + "loss": 0.763, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "grad_norm": 1.7042118310928345, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "learning_rate": 0.00036676427162754777, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.8875052332878113, + "step": 16210 + }, + { + "ce_loss": 0.1821518987417221, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.33723828196525574, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.16609439253807068, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.7967724204063416, + "step": 16210 + }, + { + "ce_loss": 0.18657946586608887, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.32966044545173645, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.17224165797233582, + "step": 16210 + }, + { + "epoch": 5.410273515677118, + "loss": 0.7536, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "grad_norm": 1.781567096710205, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "learning_rate": 0.00036633365433139754, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.8034771680831909, + "step": 16220 + }, + { + "ce_loss": 0.23686951398849487, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.3565540909767151, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.18170595169067383, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.6347296833992004, + "step": 16220 + }, + { + "ce_loss": 0.19076235592365265, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.3064838945865631, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.13722343742847443, + "step": 16220 + }, + { + "epoch": 5.413609072715143, + "loss": 0.7438, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "grad_norm": 2.2475061416625977, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "learning_rate": 0.00036590307632749543, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.7417387962341309, + "step": 16230 + }, + { + "ce_loss": 0.1585659682750702, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.2824546694755554, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.14834964275360107, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.6264053583145142, + "step": 16230 + }, + { + "ce_loss": 0.15628458559513092, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.3434602618217468, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.12648874521255493, + "step": 16230 + }, + { + "epoch": 5.416944629753169, + "loss": 0.7557, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "grad_norm": 1.4844996929168701, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "learning_rate": 0.0003654725381183721, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.7612562775611877, + "step": 16240 + }, + { + "ce_loss": 0.2294774055480957, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.3751007914543152, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.15650267899036407, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.7427468299865723, + "step": 16240 + }, + { + "ce_loss": 0.19064797461032867, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.333304762840271, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.13059431314468384, + "step": 16240 + }, + { + "epoch": 5.420280186791194, + "loss": 0.8037, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "grad_norm": 1.3405860662460327, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "learning_rate": 0.00036504204020651227, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.5487708449363708, + "step": 16250 + }, + { + "ce_loss": 0.12229014933109283, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.28631076216697693, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.11592966318130493, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.8611596822738647, + "step": 16250 + }, + { + "ce_loss": 0.18582291901111603, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.34006738662719727, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.20299170911312103, + "step": 16250 + }, + { + "epoch": 5.423615743829219, + "loss": 0.8197, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "grad_norm": 1.984776258468628, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "learning_rate": 0.000364611583094353, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.5243992805480957, + "step": 16260 + }, + { + "ce_loss": 0.1208997592329979, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.275307834148407, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.1278659701347351, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.8565524220466614, + "step": 16260 + }, + { + "ce_loss": 0.23394560813903809, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.36015230417251587, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.17624898254871368, + "step": 16260 + }, + { + "epoch": 5.426951300867245, + "loss": 0.7453, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "grad_norm": 1.8883329629898071, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "learning_rate": 0.0003641811672842842, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.8288264870643616, + "step": 16270 + }, + { + "ce_loss": 0.13347579538822174, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.2436203956604004, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.14556606113910675, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.5858508944511414, + "step": 16270 + }, + { + "ce_loss": 0.16244244575500488, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.23977768421173096, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.14092527329921722, + "step": 16270 + }, + { + "epoch": 5.43028685790527, + "loss": 0.7074, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "grad_norm": 2.5687782764434814, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "learning_rate": 0.0003637507932786475, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.6588826179504395, + "step": 16280 + }, + { + "ce_loss": 0.1949000507593155, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.2663923501968384, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.14771591126918793, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.6191297173500061, + "step": 16280 + }, + { + "ce_loss": 0.13703526556491852, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.295706570148468, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.13173656165599823, + "step": 16280 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.6516, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "grad_norm": 1.6438771486282349, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "learning_rate": 0.0003633204615797356, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.5487070083618164, + "step": 16290 + }, + { + "ce_loss": 0.14527775347232819, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.2327229082584381, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.17044702172279358, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.7046306133270264, + "step": 16290 + }, + { + "ce_loss": 0.21647413074970245, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.24801312386989594, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.19363921880722046, + "step": 16290 + }, + { + "epoch": 5.436957971981321, + "loss": 0.6573, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "grad_norm": 8.01134204864502, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "learning_rate": 0.00036289017268979204, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.5597203969955444, + "step": 16300 + }, + { + "ce_loss": 0.10873627662658691, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.21907083690166473, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.1122971624135971, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.5726298093795776, + "step": 16300 + }, + { + "ce_loss": 0.18682533502578735, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.22446873784065247, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.16113083064556122, + "step": 16300 + }, + { + "epoch": 5.440293529019346, + "loss": 0.62, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "grad_norm": 1.2961703538894653, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "learning_rate": 0.00036245992711100996, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 0.6034284234046936, + "step": 16310 + }, + { + "ce_loss": 0.22241556644439697, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.24138249456882477, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.13920840620994568, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 0.5251420736312866, + "step": 16310 + }, + { + "ce_loss": 0.15426619350910187, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.2309967428445816, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.13962532579898834, + "step": 16310 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.7097, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "grad_norm": 1.5120129585266113, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "learning_rate": 0.0003620297253455326, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.6220839619636536, + "step": 16320 + }, + { + "ce_loss": 0.1753881275653839, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.2504945397377014, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.16376005113124847, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.6079819202423096, + "step": 16320 + }, + { + "ce_loss": 0.13508924841880798, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.2549654543399811, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.11095515638589859, + "step": 16320 + }, + { + "epoch": 5.446964643095397, + "loss": 0.6923, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "grad_norm": 1.3203496932983398, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "learning_rate": 0.00036159956789545136, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.6790783405303955, + "step": 16330 + }, + { + "ce_loss": 0.19250929355621338, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.21748296916484833, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.15178987383842468, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.5082254409790039, + "step": 16330 + }, + { + "ce_loss": 0.15140938758850098, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.21036866307258606, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.1196763813495636, + "step": 16330 + }, + { + "epoch": 5.450300200133422, + "loss": 0.654, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "grad_norm": 1.700995922088623, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "learning_rate": 0.00036116945526280645, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.6487125754356384, + "step": 16340 + }, + { + "ce_loss": 0.19432617723941803, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.21522441506385803, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.14066074788570404, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.708146870136261, + "step": 16340 + }, + { + "ce_loss": 0.22304917871952057, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.2944159209728241, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.1507091075181961, + "step": 16340 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.6855, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "grad_norm": 1.3701978921890259, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "learning_rate": 0.0003607393879495857, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.5639296174049377, + "step": 16350 + }, + { + "ce_loss": 0.14695106446743011, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.22136704623699188, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.15459786355495453, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.5510401129722595, + "step": 16350 + }, + { + "ce_loss": 0.11991811543703079, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.22079601883888245, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.13425542414188385, + "step": 16350 + }, + { + "epoch": 5.456971314209473, + "loss": 0.6971, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "grad_norm": 1.924656629562378, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "learning_rate": 0.00036030936645772377, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.5232378840446472, + "step": 16360 + }, + { + "ce_loss": 0.11680949479341507, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.27530935406684875, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.0966496467590332, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.757350504398346, + "step": 16360 + }, + { + "ce_loss": 0.23436380922794342, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.32329943776130676, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.1995662897825241, + "step": 16360 + }, + { + "epoch": 5.460306871247498, + "loss": 0.7054, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "grad_norm": 1.747829556465149, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "learning_rate": 0.00035987939128910215, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.7242252826690674, + "step": 16370 + }, + { + "ce_loss": 0.24792882800102234, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.2356468290090561, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.19074757397174835, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.7105324864387512, + "step": 16370 + }, + { + "ce_loss": 0.13640131056308746, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.26673826575279236, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.14105232059955597, + "step": 16370 + }, + { + "epoch": 5.463642428285524, + "loss": 0.7779, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "grad_norm": 2.432455539703369, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "learning_rate": 0.00035944946294554786, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.591047465801239, + "step": 16380 + }, + { + "ce_loss": 0.1356351524591446, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.2682700455188751, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.12913872301578522, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.6132073402404785, + "step": 16380 + }, + { + "ce_loss": 0.16914598643779755, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.2821706533432007, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.12958523631095886, + "step": 16380 + }, + { + "epoch": 5.466977985323549, + "loss": 0.7305, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "grad_norm": 2.488482713699341, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "learning_rate": 0.0003590195819288338, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.732840895652771, + "step": 16390 + }, + { + "ce_loss": 0.1996365338563919, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.32097291946411133, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.13700750470161438, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.6898370981216431, + "step": 16390 + }, + { + "ce_loss": 0.15528330206871033, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.30699825286865234, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.14022010564804077, + "step": 16390 + }, + { + "epoch": 5.470313542361574, + "loss": 0.7357, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "grad_norm": 2.764321804046631, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "learning_rate": 0.00035858974874067746, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.7311261296272278, + "step": 16400 + }, + { + "ce_loss": 0.2125311642885208, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.33469152450561523, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.14825886487960815, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.7489011883735657, + "step": 16400 + }, + { + "ce_loss": 0.2545397877693176, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.32159775495529175, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.13286516070365906, + "step": 16400 + }, + { + "epoch": 5.4736490993996, + "loss": 0.7135, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "grad_norm": 1.813016414642334, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "learning_rate": 0.0003581599638827401, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.6902078986167908, + "step": 16410 + }, + { + "ce_loss": 0.17582891881465912, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.3405342698097229, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.13279816508293152, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.6575373411178589, + "step": 16410 + }, + { + "ce_loss": 0.18639127910137177, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.3385080099105835, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.13242416083812714, + "step": 16410 + }, + { + "epoch": 5.476984656437625, + "loss": 0.7883, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "grad_norm": 1.8976466655731201, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "learning_rate": 0.0003577302278566272, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.6805810928344727, + "step": 16420 + }, + { + "ce_loss": 0.15746085345745087, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.30172422528266907, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.16179125010967255, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.9149475693702698, + "step": 16420 + }, + { + "ce_loss": 0.21631371974945068, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.3094479739665985, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.15932220220565796, + "step": 16420 + }, + { + "epoch": 5.48032021347565, + "loss": 0.8182, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "grad_norm": 1.3693673610687256, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "learning_rate": 0.0003573005411638867, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.645654559135437, + "step": 16430 + }, + { + "ce_loss": 0.154694065451622, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.29492273926734924, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.1481200009584427, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.6389703750610352, + "step": 16430 + }, + { + "ce_loss": 0.1725892722606659, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.3293762803077698, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.10531818866729736, + "step": 16430 + }, + { + "epoch": 5.483655770513676, + "loss": 0.7309, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "grad_norm": 2.5965490341186523, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "learning_rate": 0.0003568709043060094, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.9237037301063538, + "step": 16440 + }, + { + "ce_loss": 0.16885775327682495, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.3852352797985077, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.1278907209634781, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.7946254014968872, + "step": 16440 + }, + { + "ce_loss": 0.19258242845535278, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.33972975611686707, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.14921855926513672, + "step": 16440 + }, + { + "epoch": 5.486991327551701, + "loss": 0.765, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "grad_norm": 1.9643694162368774, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "learning_rate": 0.0003564413177844276, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.5418407320976257, + "step": 16450 + }, + { + "ce_loss": 0.12205801159143448, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.2988021969795227, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.12057796865701675, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.6309268474578857, + "step": 16450 + }, + { + "ce_loss": 0.16797548532485962, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.2645770311355591, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.15919606387615204, + "step": 16450 + }, + { + "epoch": 5.490326884589726, + "loss": 0.7653, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "grad_norm": 2.2363979816436768, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "learning_rate": 0.0003560117821005151, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.6473536491394043, + "step": 16460 + }, + { + "ce_loss": 0.12286846339702606, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.23971088230609894, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.1602736860513687, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.5908822417259216, + "step": 16460 + }, + { + "ce_loss": 0.18052250146865845, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.29990532994270325, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.11026539653539658, + "step": 16460 + }, + { + "epoch": 5.493662441627752, + "loss": 0.7457, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "grad_norm": 1.346691370010376, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "learning_rate": 0.00035558229775558615, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.615505039691925, + "step": 16470 + }, + { + "ce_loss": 0.12159193307161331, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.2667520344257355, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.15134981274604797, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.6371005773544312, + "step": 16470 + }, + { + "ce_loss": 0.17626246809959412, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.2783363461494446, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.1304175853729248, + "step": 16470 + }, + { + "epoch": 5.496997998665777, + "loss": 0.7309, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "grad_norm": 1.4785386323928833, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "learning_rate": 0.00035515286525089536, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.7368677854537964, + "step": 16480 + }, + { + "ce_loss": 0.22332510352134705, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.3088325560092926, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.1685892939567566, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.6153769493103027, + "step": 16480 + }, + { + "ce_loss": 0.13858671486377716, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.2944321632385254, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.12610027194023132, + "step": 16480 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.723, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "grad_norm": 5.9627532958984375, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "learning_rate": 0.0003547234850876364, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.7170730233192444, + "step": 16490 + }, + { + "ce_loss": 0.1981697976589203, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.3203241229057312, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.13835738599300385, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.9237845540046692, + "step": 16490 + }, + { + "ce_loss": 0.30550992488861084, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.3872705101966858, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.17537952959537506, + "step": 16490 + }, + { + "epoch": 5.503669112741828, + "loss": 0.7416, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "grad_norm": 1.865121603012085, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "learning_rate": 0.00035429415776694237, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.7768089771270752, + "step": 16500 + }, + { + "ce_loss": 0.1884937435388565, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.3611510396003723, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.14749257266521454, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.7384665608406067, + "step": 16500 + }, + { + "ce_loss": 0.16080470383167267, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.3617389500141144, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.11626183241605759, + "step": 16500 + }, + { + "epoch": 5.507004669779853, + "loss": 0.6953, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "grad_norm": 6.708162784576416, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "learning_rate": 0.0003538648837898844, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.7219257354736328, + "step": 16510 + }, + { + "ce_loss": 0.17056331038475037, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.297270804643631, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.15417173504829407, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.6959221959114075, + "step": 16510 + }, + { + "ce_loss": 0.16128702461719513, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.3540896773338318, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.13030454516410828, + "step": 16510 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.7567, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "grad_norm": 2.4799792766571045, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "learning_rate": 0.0003534356636574714, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.9273051619529724, + "step": 16520 + }, + { + "ce_loss": 0.1862836480140686, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.4356238842010498, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.16171026229858398, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.9347162246704102, + "step": 16520 + }, + { + "ce_loss": 0.22167471051216125, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.35594063997268677, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.13794006407260895, + "step": 16520 + }, + { + "epoch": 5.513675783855904, + "loss": 0.8511, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "grad_norm": 2.2826759815216064, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "learning_rate": 0.0003530064978706494, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.5434016585350037, + "step": 16530 + }, + { + "ce_loss": 0.10969436913728714, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.2416934370994568, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.1079748123884201, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.8625865578651428, + "step": 16530 + }, + { + "ce_loss": 0.17143461108207703, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.29201212525367737, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.1500742882490158, + "step": 16530 + }, + { + "epoch": 5.517011340893929, + "loss": 0.7527, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "grad_norm": 2.8519067764282227, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "learning_rate": 0.0003525773869303012, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.8090465068817139, + "step": 16540 + }, + { + "ce_loss": 0.1982586681842804, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.3481493592262268, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.14651048183441162, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.875690758228302, + "step": 16540 + }, + { + "ce_loss": 0.188467875123024, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.3411063849925995, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.13025033473968506, + "step": 16540 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.811, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "grad_norm": 2.2053282260894775, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "learning_rate": 0.00035214833133724523, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.7356788516044617, + "step": 16550 + }, + { + "ce_loss": 0.19450567662715912, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.31631433963775635, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.1460232138633728, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.6888248324394226, + "step": 16550 + }, + { + "ce_loss": 0.1981515884399414, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.33612701296806335, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.1542927771806717, + "step": 16550 + }, + { + "epoch": 5.52368245496998, + "loss": 0.7646, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "grad_norm": 1.941384196281433, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "learning_rate": 0.0003517193315922358, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 0.7945826053619385, + "step": 16560 + }, + { + "ce_loss": 0.1891850233078003, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.3589797616004944, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.14466382563114166, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 1.0708749294281006, + "step": 16560 + }, + { + "ce_loss": 0.15622949600219727, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.36662739515304565, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.1381683200597763, + "step": 16560 + }, + { + "epoch": 5.527018012008005, + "loss": 0.7938, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "grad_norm": 2.5366055965423584, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "learning_rate": 0.00035129038819596147, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.5957663059234619, + "step": 16570 + }, + { + "ce_loss": 0.17193228006362915, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.29176750779151917, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.13183808326721191, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.7876423001289368, + "step": 16570 + }, + { + "ce_loss": 0.21744805574417114, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.33549293875694275, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.1758352667093277, + "step": 16570 + }, + { + "epoch": 5.530353569046031, + "loss": 0.819, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "grad_norm": 1.6034607887268066, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "learning_rate": 0.00035086150164904555, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.5763473510742188, + "step": 16580 + }, + { + "ce_loss": 0.12128767371177673, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.2356862723827362, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.1255425214767456, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.868805468082428, + "step": 16580 + }, + { + "ce_loss": 0.1575346738100052, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.3505779504776001, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.1724642813205719, + "step": 16580 + }, + { + "epoch": 5.533689126084056, + "loss": 0.8047, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "grad_norm": 2.1715855598449707, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "learning_rate": 0.00035043267245204464, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.5283437371253967, + "step": 16590 + }, + { + "ce_loss": 0.1427266001701355, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.26146018505096436, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.12393170595169067, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.7784868478775024, + "step": 16590 + }, + { + "ce_loss": 0.17701445519924164, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.32144787907600403, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.16988804936408997, + "step": 16590 + }, + { + "epoch": 5.537024683122081, + "loss": 0.6876, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "grad_norm": 1.437143325805664, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "learning_rate": 0.0003500039011054486, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.6373987197875977, + "step": 16600 + }, + { + "ce_loss": 0.14163723587989807, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.3294292390346527, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.11789793521165848, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.957266628742218, + "step": 16600 + }, + { + "ce_loss": 0.11309836059808731, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.26229843497276306, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.13751594722270966, + "step": 16600 + }, + { + "epoch": 5.540360240160107, + "loss": 0.7447, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "grad_norm": 1.6323940753936768, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "learning_rate": 0.00034957518810967993, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.9404606819152832, + "step": 16610 + }, + { + "ce_loss": 0.1382031887769699, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.3189763128757477, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.14445778727531433, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.6262696385383606, + "step": 16610 + }, + { + "ce_loss": 0.13352514803409576, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.33467742800712585, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.11829882860183716, + "step": 16610 + }, + { + "epoch": 5.543695797198132, + "loss": 0.8215, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "grad_norm": 1.5988037586212158, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "learning_rate": 0.00034914653396509257, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.9318430423736572, + "step": 16620 + }, + { + "ce_loss": 0.15590324997901917, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.26491907238960266, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.15649548172950745, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.8442918062210083, + "step": 16620 + }, + { + "ce_loss": 0.22664012014865875, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.38372889161109924, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.178589329123497, + "step": 16620 + }, + { + "epoch": 5.547031354236157, + "loss": 0.7808, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "grad_norm": 2.557018756866455, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "learning_rate": 0.00034871793917197225, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.7209650278091431, + "step": 16630 + }, + { + "ce_loss": 0.18692630529403687, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.3469357192516327, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.15406200289726257, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.7244488000869751, + "step": 16630 + }, + { + "ce_loss": 0.2061254233121872, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.3313891291618347, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.16208119690418243, + "step": 16630 + }, + { + "epoch": 5.550366911274183, + "loss": 0.734, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "grad_norm": 2.5533528327941895, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "learning_rate": 0.00034828940423053495, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 1.1351650953292847, + "step": 16640 + }, + { + "ce_loss": 0.20006652176380157, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.35626906156539917, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.1594037264585495, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 0.7358875274658203, + "step": 16640 + }, + { + "ce_loss": 0.14336541295051575, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.34208860993385315, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.1457943618297577, + "step": 16640 + }, + { + "epoch": 5.553702468312208, + "loss": 0.7556, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "grad_norm": 1.638131856918335, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "learning_rate": 0.00034786092964092736, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.6036756634712219, + "step": 16650 + }, + { + "ce_loss": 0.14441810548305511, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.32643380761146545, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.13198567926883698, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.5919756293296814, + "step": 16650 + }, + { + "ce_loss": 0.14673617482185364, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.3344498574733734, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.10504456609487534, + "step": 16650 + }, + { + "epoch": 5.557038025350233, + "loss": 0.6602, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "grad_norm": 1.5747668743133545, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "learning_rate": 0.00034743251590322515, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.7086818814277649, + "step": 16660 + }, + { + "ce_loss": 0.20601743459701538, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.3181675374507904, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.1258608102798462, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.7951910495758057, + "step": 16660 + }, + { + "ce_loss": 0.1366240233182907, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.26778659224510193, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.12254621833562851, + "step": 16660 + }, + { + "epoch": 5.560373582388259, + "loss": 0.8001, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "grad_norm": 1.6090749502182007, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "learning_rate": 0.00034700416351743347, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.8416956067085266, + "step": 16670 + }, + { + "ce_loss": 0.20460933446884155, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.3494933843612671, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.17335765063762665, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.6869195699691772, + "step": 16670 + }, + { + "ce_loss": 0.16732385754585266, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.30530908703804016, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.16549931466579437, + "step": 16670 + }, + { + "epoch": 5.563709139426284, + "loss": 0.7658, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "grad_norm": 3.558506727218628, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "learning_rate": 0.0003465758729834855, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 2.007232189178467, + "step": 16680 + }, + { + "ce_loss": 0.20884384214878082, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.28895843029022217, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.1980840563774109, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 0.6627374291419983, + "step": 16680 + }, + { + "ce_loss": 0.11687424778938293, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.349597305059433, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.1425715684890747, + "step": 16680 + }, + { + "epoch": 5.567044696464309, + "loss": 0.8259, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "grad_norm": 2.0322229862213135, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "learning_rate": 0.00034614764480124234, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.8495004773139954, + "step": 16690 + }, + { + "ce_loss": 0.21280600130558014, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.40029972791671753, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.14662204682826996, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.7268968820571899, + "step": 16690 + }, + { + "ce_loss": 0.1668766587972641, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.3052133619785309, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.17897547781467438, + "step": 16690 + }, + { + "epoch": 5.570380253502335, + "loss": 0.7709, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "grad_norm": 2.0620107650756836, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "learning_rate": 0.0003457194794704926, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.7075026035308838, + "step": 16700 + }, + { + "ce_loss": 0.17498093843460083, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.26175427436828613, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.14034396409988403, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.8278827667236328, + "step": 16700 + }, + { + "ce_loss": 0.18370160460472107, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.3056749999523163, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.16722241044044495, + "step": 16700 + }, + { + "epoch": 5.57371581054036, + "loss": 0.7071, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "grad_norm": 1.755882740020752, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "learning_rate": 0.000345291377490951, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.7470847368240356, + "step": 16710 + }, + { + "ce_loss": 0.187892347574234, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.350243479013443, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.16287052631378174, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.8079235553741455, + "step": 16710 + }, + { + "ce_loss": 0.18731939792633057, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.34790876507759094, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.14191386103630066, + "step": 16710 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.7708, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "grad_norm": 1.743152141571045, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "learning_rate": 0.0003448633393622588, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.6767538189888, + "step": 16720 + }, + { + "ce_loss": 0.19777166843414307, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.30505162477493286, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.1371420919895172, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.8049007654190063, + "step": 16720 + }, + { + "ce_loss": 0.1970512717962265, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.3510020971298218, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.16732670366764069, + "step": 16720 + }, + { + "epoch": 5.580386924616411, + "loss": 0.7108, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "grad_norm": 1.6241730451583862, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "learning_rate": 0.00034443536558398255, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.7816483974456787, + "step": 16730 + }, + { + "ce_loss": 0.1863582879304886, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.3359223008155823, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.1906632035970688, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.6090654730796814, + "step": 16730 + }, + { + "ce_loss": 0.14092423021793365, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.27845025062561035, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.12028780579566956, + "step": 16730 + }, + { + "epoch": 5.583722481654436, + "loss": 0.754, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "grad_norm": 2.710909128189087, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "learning_rate": 0.0003440074566556137, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.852007269859314, + "step": 16740 + }, + { + "ce_loss": 0.2281089574098587, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.3126354515552521, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.14135898649692535, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.4547881484031677, + "step": 16740 + }, + { + "ce_loss": 0.09218557178974152, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.22557759284973145, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.09807252138853073, + "step": 16740 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.6579, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "grad_norm": 1.5583220720291138, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "learning_rate": 0.000343579613076568, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.9663202166557312, + "step": 16750 + }, + { + "ce_loss": 0.2363245040178299, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.34112247824668884, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.23673084378242493, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.5706340670585632, + "step": 16750 + }, + { + "ce_loss": 0.1683376133441925, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.27087903022766113, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.10743244737386703, + "step": 16750 + }, + { + "epoch": 5.590393595730487, + "loss": 0.7946, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "grad_norm": 1.9442334175109863, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "learning_rate": 0.00034315183534618484, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.5476641654968262, + "step": 16760 + }, + { + "ce_loss": 0.1564968228340149, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.2599179446697235, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.10186018794775009, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.5254256129264832, + "step": 16760 + }, + { + "ce_loss": 0.13415220379829407, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.2226860672235489, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.116874560713768, + "step": 16760 + }, + { + "epoch": 5.593729152768512, + "loss": 0.7478, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "grad_norm": 2.253359317779541, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "learning_rate": 0.00034272412396372707, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.89459627866745, + "step": 16770 + }, + { + "ce_loss": 0.25067758560180664, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.347787469625473, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.17483831942081451, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.5755136609077454, + "step": 16770 + }, + { + "ce_loss": 0.1253049522638321, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.2704051733016968, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.1341908723115921, + "step": 16770 + }, + { + "epoch": 5.597064709806538, + "loss": 0.7443, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "grad_norm": 1.7001421451568604, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "learning_rate": 0.0003422964794283796, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 0.6673210263252258, + "step": 16780 + }, + { + "ce_loss": 0.18451565504074097, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.3047248125076294, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.1554635465145111, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 1.0121006965637207, + "step": 16780 + }, + { + "ce_loss": 0.2369699776172638, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.33116698265075684, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.13422130048274994, + "step": 16780 + }, + { + "epoch": 5.600400266844563, + "loss": 0.7259, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "grad_norm": 1.4096695184707642, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "learning_rate": 0.00034186890223924995, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 0.9694709181785583, + "step": 16790 + }, + { + "ce_loss": 0.22094103693962097, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.35448187589645386, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.16207851469516754, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 1.0303623676300049, + "step": 16790 + }, + { + "ce_loss": 0.18831492960453033, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.2995215058326721, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.18782763183116913, + "step": 16790 + }, + { + "epoch": 5.603735823882588, + "loss": 0.6969, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "grad_norm": 1.4540307521820068, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "learning_rate": 0.00034144139289536647, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.49818724393844604, + "step": 16800 + }, + { + "ce_loss": 0.11839782446622849, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.2139374017715454, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.12370803952217102, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.44672223925590515, + "step": 16800 + }, + { + "ce_loss": 0.13540023565292358, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.2240711748600006, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.08712411671876907, + "step": 16800 + }, + { + "epoch": 5.607071380920614, + "loss": 0.7494, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "grad_norm": 1.7922204732894897, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "learning_rate": 0.0003410139518956787, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.7348718643188477, + "step": 16810 + }, + { + "ce_loss": 0.20842675864696503, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.27669811248779297, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.13902993500232697, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.6885568499565125, + "step": 16810 + }, + { + "ce_loss": 0.2071564942598343, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.2704514265060425, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.15458983182907104, + "step": 16810 + }, + { + "epoch": 5.610406937958639, + "loss": 0.712, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "grad_norm": 1.3382842540740967, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "learning_rate": 0.00034058657973905606, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.7045660614967346, + "step": 16820 + }, + { + "ce_loss": 0.2180887907743454, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.2846103310585022, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.1844034641981125, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.7456596493721008, + "step": 16820 + }, + { + "ce_loss": 0.16406604647636414, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.2776634097099304, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.17183589935302734, + "step": 16820 + }, + { + "epoch": 5.613742494996664, + "loss": 0.7483, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "grad_norm": 1.4651700258255005, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "learning_rate": 0.0003401592769242881, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.6531354784965515, + "step": 16830 + }, + { + "ce_loss": 0.18232855200767517, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.34719792008399963, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.1235121637582779, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.622502326965332, + "step": 16830 + }, + { + "ce_loss": 0.17055979371070862, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.2820592522621155, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.13489072024822235, + "step": 16830 + }, + { + "epoch": 5.61707805203469, + "loss": 0.6715, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "grad_norm": 1.7071387767791748, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "learning_rate": 0.0003397320439500832, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.9534569978713989, + "step": 16840 + }, + { + "ce_loss": 0.18748196959495544, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.3256116509437561, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.13495130836963654, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.5402416586875916, + "step": 16840 + }, + { + "ce_loss": 0.15914657711982727, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.2467021942138672, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.13433469831943512, + "step": 16840 + }, + { + "epoch": 5.620413609072715, + "loss": 0.7054, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "grad_norm": 2.036802053451538, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "learning_rate": 0.00033930488131506803, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 0.7515554428100586, + "step": 16850 + }, + { + "ce_loss": 0.21604153513908386, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.28629425168037415, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.14884991943836212, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 1.2459590435028076, + "step": 16850 + }, + { + "ce_loss": 0.20419827103614807, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.3320619761943817, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.19198916852474213, + "step": 16850 + }, + { + "epoch": 5.62374916611074, + "loss": 0.7191, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "grad_norm": 1.3319770097732544, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "learning_rate": 0.0003388777895177874, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.905265748500824, + "step": 16860 + }, + { + "ce_loss": 0.2206185758113861, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.3126305043697357, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.16376717388629913, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.5250917673110962, + "step": 16860 + }, + { + "ce_loss": 0.12526977062225342, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.23528367280960083, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.11867980659008026, + "step": 16860 + }, + { + "epoch": 5.627084723148766, + "loss": 0.7325, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "grad_norm": 1.498423457145691, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "learning_rate": 0.00033845076905670353, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.6130893230438232, + "step": 16870 + }, + { + "ce_loss": 0.2040909081697464, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.3104015290737152, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.09847946465015411, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.7601680159568787, + "step": 16870 + }, + { + "ce_loss": 0.26276570558547974, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.3020651936531067, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.19520622491836548, + "step": 16870 + }, + { + "epoch": 5.630420280186791, + "loss": 0.739, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "grad_norm": 5.525702476501465, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "learning_rate": 0.0003380238204301951, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.7757127285003662, + "step": 16880 + }, + { + "ce_loss": 0.19095243513584137, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.30943331122398376, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.15145494043827057, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.5341856479644775, + "step": 16880 + }, + { + "ce_loss": 0.13416753709316254, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.2838825583457947, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.11606307327747345, + "step": 16880 + }, + { + "epoch": 5.633755837224816, + "loss": 0.6988, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "grad_norm": 3.330446720123291, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "learning_rate": 0.0003375969441365572, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.6190627217292786, + "step": 16890 + }, + { + "ce_loss": 0.1746228188276291, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.2694947123527527, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.13239260017871857, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.5963748097419739, + "step": 16890 + }, + { + "ce_loss": 0.16726279258728027, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.26655513048171997, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.1624668687582016, + "step": 16890 + }, + { + "epoch": 5.637091394262842, + "loss": 0.7009, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "grad_norm": 1.365567922592163, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "learning_rate": 0.0003371701406740002, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.5843186378479004, + "step": 16900 + }, + { + "ce_loss": 0.133130744099617, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.2553125023841858, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.1419401913881302, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.5544773936271667, + "step": 16900 + }, + { + "ce_loss": 0.16995187103748322, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.2469933032989502, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.13727059960365295, + "step": 16900 + }, + { + "epoch": 5.640426951300867, + "loss": 0.6852, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "grad_norm": 1.9859882593154907, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "learning_rate": 0.0003367434105406499, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.7333486080169678, + "step": 16910 + }, + { + "ce_loss": 0.11421021074056625, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.3491458296775818, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.12321870774030685, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.9772500395774841, + "step": 16910 + }, + { + "ce_loss": 0.2581791579723358, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.38738787174224854, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.18698999285697937, + "step": 16910 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.7663, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "grad_norm": 1.4611917734146118, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "learning_rate": 0.0003363167542345462, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.8015724420547485, + "step": 16920 + }, + { + "ce_loss": 0.24213604629039764, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.3938767910003662, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.13820527493953705, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.9192935228347778, + "step": 16920 + }, + { + "ce_loss": 0.18241621553897858, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.309222012758255, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.16970683634281158, + "step": 16920 + }, + { + "epoch": 5.647098065376918, + "loss": 0.8112, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "grad_norm": 1.9022551774978638, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "learning_rate": 0.0003358901722536427, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.6248596906661987, + "step": 16930 + }, + { + "ce_loss": 0.1475604623556137, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.30214765667915344, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.09551089257001877, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.9147480130195618, + "step": 16930 + }, + { + "ce_loss": 0.23225298523902893, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.4083430767059326, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.20748469233512878, + "step": 16930 + }, + { + "epoch": 5.650433622414943, + "loss": 0.7233, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "grad_norm": 2.352044105529785, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "learning_rate": 0.0003354636650958069, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.8966991901397705, + "step": 16940 + }, + { + "ce_loss": 0.19321158528327942, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.32078060507774353, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.1405438929796219, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.7034854292869568, + "step": 16940 + }, + { + "ce_loss": 0.1795550435781479, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.3291091024875641, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.16134437918663025, + "step": 16940 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.7532, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "grad_norm": 1.9853451251983643, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "learning_rate": 0.0003350372332588183, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.6733770966529846, + "step": 16950 + }, + { + "ce_loss": 0.20213384926319122, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.2750031352043152, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.14148816466331482, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.6604835391044617, + "step": 16950 + }, + { + "ce_loss": 0.18545019626617432, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.3066851794719696, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.1253993958234787, + "step": 16950 + }, + { + "epoch": 5.657104736490994, + "loss": 0.7522, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "grad_norm": 1.2449283599853516, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "learning_rate": 0.0003346108772403688, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.7136119604110718, + "step": 16960 + }, + { + "ce_loss": 0.1602572649717331, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.35394537448883057, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.15927578508853912, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.7708046436309814, + "step": 16960 + }, + { + "ce_loss": 0.198671355843544, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.32375049591064453, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.14237548410892487, + "step": 16960 + }, + { + "epoch": 5.660440293529019, + "loss": 0.8561, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "grad_norm": 2.542083263397217, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "learning_rate": 0.0003341845975380617, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.5686244368553162, + "step": 16970 + }, + { + "ce_loss": 0.15705722570419312, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.24046377837657928, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.13262216746807098, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.6415659189224243, + "step": 16970 + }, + { + "ce_loss": 0.15539200603961945, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.3097907602787018, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.12549258768558502, + "step": 16970 + }, + { + "epoch": 5.663775850567045, + "loss": 0.781, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "grad_norm": 16.903135299682617, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "learning_rate": 0.0003337583946494113, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 1.1284102201461792, + "step": 16980 + }, + { + "ce_loss": 0.2386995404958725, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.3742934763431549, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.17547878623008728, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 0.7611078023910522, + "step": 16980 + }, + { + "ce_loss": 0.21285733580589294, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.400984525680542, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.14690493047237396, + "step": 16980 + }, + { + "epoch": 5.66711140760507, + "loss": 0.7839, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "grad_norm": 2.551992654800415, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "learning_rate": 0.00033333226907184216, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.5933461785316467, + "step": 16990 + }, + { + "ce_loss": 0.17673857510089874, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.2522673010826111, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.136405810713768, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.6428248286247253, + "step": 16990 + }, + { + "ce_loss": 0.18502990901470184, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.2771799564361572, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.1473894864320755, + "step": 16990 + }, + { + "epoch": 5.670446964643095, + "loss": 0.7321, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "grad_norm": 2.8791425228118896, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "learning_rate": 0.00033290622130268885, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.7614845037460327, + "step": 17000 + }, + { + "ce_loss": 0.14597536623477936, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.35557541251182556, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.1483568251132965, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.6029954552650452, + "step": 17000 + }, + { + "ce_loss": 0.12021680921316147, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.3635213077068329, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.11895711719989777, + "step": 17000 + }, + { + "epoch": 5.673782521681121, + "loss": 0.7013, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "grad_norm": 1.236177921295166, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "learning_rate": 0.0003324802518391948, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.6029284000396729, + "step": 17010 + }, + { + "ce_loss": 0.12316080182790756, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.30217641592025757, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.1424601823091507, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.7892953753471375, + "step": 17010 + }, + { + "ce_loss": 0.20078976452350616, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.33275288343429565, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.14471758902072906, + "step": 17010 + }, + { + "epoch": 5.677118078719146, + "loss": 0.7193, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "grad_norm": 1.97862708568573, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "learning_rate": 0.00033205436117851237, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.6347103118896484, + "step": 17020 + }, + { + "ce_loss": 0.11872317641973495, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.2712605595588684, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.15120559930801392, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.6937782764434814, + "step": 17020 + }, + { + "ce_loss": 0.14366665482521057, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.3271183967590332, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.16461552679538727, + "step": 17020 + }, + { + "epoch": 5.680453635757171, + "loss": 0.7533, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "grad_norm": 1.3897510766983032, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "learning_rate": 0.00033162854981770167, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 1.5620931386947632, + "step": 17030 + }, + { + "ce_loss": 0.17498187720775604, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.35835498571395874, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.13596129417419434, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 0.7162748575210571, + "step": 17030 + }, + { + "ce_loss": 0.20580850541591644, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.29947516322135925, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.1639908254146576, + "step": 17030 + }, + { + "epoch": 5.683789192795197, + "loss": 0.7724, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "grad_norm": 1.2456682920455933, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "learning_rate": 0.0003312028182537302, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.620795726776123, + "step": 17040 + }, + { + "ce_loss": 0.20517976582050323, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.29193615913391113, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.12346983700990677, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.9642418622970581, + "step": 17040 + }, + { + "ce_loss": 0.2738221287727356, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.30171912908554077, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.1887759566307068, + "step": 17040 + }, + { + "epoch": 5.687124749833222, + "loss": 0.6817, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "grad_norm": 2.327080488204956, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "learning_rate": 0.0003307771669834729, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.6097831130027771, + "step": 17050 + }, + { + "ce_loss": 0.15700411796569824, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.2313498556613922, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.1254100203514099, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.6765387058258057, + "step": 17050 + }, + { + "ce_loss": 0.21561045944690704, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.2853943109512329, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.14921283721923828, + "step": 17050 + }, + { + "epoch": 5.690460306871247, + "loss": 0.655, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "grad_norm": 1.403415322303772, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "learning_rate": 0.0003303515965037104, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.7461519837379456, + "step": 17060 + }, + { + "ce_loss": 0.16204185783863068, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.2274303436279297, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.11113007366657257, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.6564823389053345, + "step": 17060 + }, + { + "ce_loss": 0.14736226201057434, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.26632723212242126, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.14623504877090454, + "step": 17060 + }, + { + "epoch": 5.693795863909273, + "loss": 0.6932, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "grad_norm": 2.4529449939727783, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "learning_rate": 0.00032992610731112925, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.6349560022354126, + "step": 17070 + }, + { + "ce_loss": 0.1573634296655655, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.31927490234375, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.12076559662818909, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.7947009205818176, + "step": 17070 + }, + { + "ce_loss": 0.20859742164611816, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.30404800176620483, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.15086445212364197, + "step": 17070 + }, + { + "epoch": 5.697131420947298, + "loss": 0.6802, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "grad_norm": 1.5436433553695679, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "learning_rate": 0.0003295006999023212, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.4146399199962616, + "step": 17080 + }, + { + "ce_loss": 0.08536611497402191, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.21758729219436646, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.0844498947262764, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.8940048813819885, + "step": 17080 + }, + { + "ce_loss": 0.2023334503173828, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.2644149661064148, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.17946581542491913, + "step": 17080 + }, + { + "epoch": 5.700466977985323, + "loss": 0.782, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "grad_norm": 4.547679424285889, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "learning_rate": 0.00032907537477378234, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.9368367195129395, + "step": 17090 + }, + { + "ce_loss": 0.22108429670333862, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.4351109266281128, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.1405351758003235, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.6494972705841064, + "step": 17090 + }, + { + "ce_loss": 0.16825975477695465, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.2918960154056549, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.14743435382843018, + "step": 17090 + }, + { + "epoch": 5.703802535023349, + "loss": 0.7289, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "grad_norm": 3.1013355255126953, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "learning_rate": 0.00032865013242191295, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.8688377737998962, + "step": 17100 + }, + { + "ce_loss": 0.17676447331905365, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.3083076477050781, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.12058064341545105, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.9613972902297974, + "step": 17100 + }, + { + "ce_loss": 0.24627932906150818, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.30985212326049805, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.2107589840888977, + "step": 17100 + }, + { + "epoch": 5.707138092061374, + "loss": 0.7592, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "grad_norm": 1.4553122520446777, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "learning_rate": 0.00032822497334301654, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.6339501142501831, + "step": 17110 + }, + { + "ce_loss": 0.15719901025295258, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.254035621881485, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.14738909900188446, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.8309582471847534, + "step": 17110 + }, + { + "ce_loss": 0.2074785679578781, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.32274875044822693, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.17456257343292236, + "step": 17110 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.7441, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "grad_norm": 1.737001895904541, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "learning_rate": 0.00032779989803329967, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.7288815975189209, + "step": 17120 + }, + { + "ce_loss": 0.2120910882949829, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.33151134848594666, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.15215155482292175, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.7121763229370117, + "step": 17120 + }, + { + "ce_loss": 0.1438087373971939, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.2862492799758911, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.14188453555107117, + "step": 17120 + }, + { + "epoch": 5.713809206137425, + "loss": 0.6852, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "grad_norm": 1.4312630891799927, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "learning_rate": 0.0003273749069888707, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.4745047688484192, + "step": 17130 + }, + { + "ce_loss": 0.12595778703689575, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.23172588646411896, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.1163245290517807, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.5332661867141724, + "step": 17130 + }, + { + "ce_loss": 0.1429232954978943, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.2580482065677643, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.1318104863166809, + "step": 17130 + }, + { + "epoch": 5.71714476317545, + "loss": 0.6839, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "grad_norm": 3.277230739593506, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "learning_rate": 0.00032695000070574016, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.7500348091125488, + "step": 17140 + }, + { + "ce_loss": 0.23020581901073456, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.36458221077919006, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.15495723485946655, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.7711657881736755, + "step": 17140 + }, + { + "ce_loss": 0.2473836988210678, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.33617162704467773, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.12509773671627045, + "step": 17140 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.712, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "grad_norm": 2.578519582748413, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "learning_rate": 0.00032652517967981913, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.6146144270896912, + "step": 17150 + }, + { + "ce_loss": 0.14902712404727936, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.2777806520462036, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.14889934659004211, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.795230507850647, + "step": 17150 + }, + { + "ce_loss": 0.2181692123413086, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.2780587077140808, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.20927740633487701, + "step": 17150 + }, + { + "epoch": 5.723815877251501, + "loss": 0.6756, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "grad_norm": 1.8128324747085571, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "learning_rate": 0.00032610044440691975, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.5578626990318298, + "step": 17160 + }, + { + "ce_loss": 0.1543758362531662, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.257249116897583, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.11274212598800659, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.594210147857666, + "step": 17160 + }, + { + "ce_loss": 0.14818356931209564, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.25856828689575195, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.147581085562706, + "step": 17160 + }, + { + "epoch": 5.727151434289526, + "loss": 0.6834, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "grad_norm": 1.5219569206237793, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "learning_rate": 0.0003256757953827537, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.7890151739120483, + "step": 17170 + }, + { + "ce_loss": 0.17950721085071564, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.2605040967464447, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.19137854874134064, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.703957736492157, + "step": 17170 + }, + { + "ce_loss": 0.15624277293682098, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.23440703749656677, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.1445663720369339, + "step": 17170 + }, + { + "epoch": 5.730486991327552, + "loss": 0.7209, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "grad_norm": 1.7926363945007324, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "learning_rate": 0.000325251233102932, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.47458457946777344, + "step": 17180 + }, + { + "ce_loss": 0.10066297650337219, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.23481415212154388, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.08710433542728424, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.8608106970787048, + "step": 17180 + }, + { + "ce_loss": 0.24073518812656403, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.4313311278820038, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.14764869213104248, + "step": 17180 + }, + { + "epoch": 5.733822548365577, + "loss": 0.7745, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "grad_norm": 2.1348068714141846, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "learning_rate": 0.0003248267580629647, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.9487937688827515, + "step": 17190 + }, + { + "ce_loss": 0.24654243886470795, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.3833885192871094, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.19179615378379822, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.5966364741325378, + "step": 17190 + }, + { + "ce_loss": 0.16048943996429443, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.29141372442245483, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.10366712510585785, + "step": 17190 + }, + { + "epoch": 5.737158105403602, + "loss": 0.6736, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "grad_norm": 1.6799206733703613, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "learning_rate": 0.00032440237075825954, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.7083799839019775, + "step": 17200 + }, + { + "ce_loss": 0.13097502291202545, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.27484971284866333, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.08867395669221878, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.6861226558685303, + "step": 17200 + }, + { + "ce_loss": 0.12615366280078888, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.2812725305557251, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.1325719952583313, + "step": 17200 + }, + { + "epoch": 5.740493662441628, + "loss": 0.7134, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "grad_norm": 1.6573398113250732, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "learning_rate": 0.00032397807168412244, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.7408238649368286, + "step": 17210 + }, + { + "ce_loss": 0.14878395199775696, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.2950376272201538, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.11640201508998871, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.7037614583969116, + "step": 17210 + }, + { + "ce_loss": 0.19889037311077118, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.29107096791267395, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.1580825001001358, + "step": 17210 + }, + { + "epoch": 5.743829219479653, + "loss": 0.7708, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "grad_norm": 1.7908594608306885, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "learning_rate": 0.00032355386133575594, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.6606780886650085, + "step": 17220 + }, + { + "ce_loss": 0.2049364149570465, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.23314324021339417, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.16539844870567322, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.7575362920761108, + "step": 17220 + }, + { + "ce_loss": 0.18157583475112915, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.32159459590911865, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.14340363442897797, + "step": 17220 + }, + { + "epoch": 5.747164776517678, + "loss": 0.7087, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "grad_norm": 1.848981499671936, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "learning_rate": 0.0003231297402082592, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.6597747206687927, + "step": 17230 + }, + { + "ce_loss": 0.17482557892799377, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.3133894205093384, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.13527342677116394, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.7282883524894714, + "step": 17230 + }, + { + "ce_loss": 0.15620630979537964, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.37796542048454285, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.1426892727613449, + "step": 17230 + }, + { + "epoch": 5.750500333555704, + "loss": 0.696, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "grad_norm": 1.7952791452407837, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "learning_rate": 0.0003227057087966273, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.6785531640052795, + "step": 17240 + }, + { + "ce_loss": 0.1512996107339859, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.3508850634098053, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.1758999079465866, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.7377484440803528, + "step": 17240 + }, + { + "ce_loss": 0.173048198223114, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.33531084656715393, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.11774720996618271, + "step": 17240 + }, + { + "epoch": 5.753835890593729, + "loss": 0.6903, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "grad_norm": 1.3963509798049927, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "learning_rate": 0.00032228176759575036, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.599527895450592, + "step": 17250 + }, + { + "ce_loss": 0.15814733505249023, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.27153316140174866, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.16702882945537567, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.7268506288528442, + "step": 17250 + }, + { + "ce_loss": 0.1674957573413849, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.33976811170578003, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.11406680941581726, + "step": 17250 + }, + { + "epoch": 5.757171447631754, + "loss": 0.694, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "grad_norm": 1.7198892831802368, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "learning_rate": 0.0003218579171004134, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.6412903666496277, + "step": 17260 + }, + { + "ce_loss": 0.17422105371952057, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.30146506428718567, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.1300029456615448, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.7008032202720642, + "step": 17260 + }, + { + "ce_loss": 0.17944203317165375, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.3309924602508545, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.14378021657466888, + "step": 17260 + }, + { + "epoch": 5.76050700466978, + "loss": 0.687, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "grad_norm": 1.684322714805603, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "learning_rate": 0.0003214341578052958, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.8886874318122864, + "step": 17270 + }, + { + "ce_loss": 0.17103131115436554, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.24941280484199524, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.1687363088130951, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.7393255829811096, + "step": 17270 + }, + { + "ce_loss": 0.1802368462085724, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.28230059146881104, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.14546415209770203, + "step": 17270 + }, + { + "epoch": 5.763842561707805, + "loss": 0.7536, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "grad_norm": 2.5762276649475098, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "learning_rate": 0.0003210104902049699, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.6105315685272217, + "step": 17280 + }, + { + "ce_loss": 0.14770327508449554, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.21739080548286438, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.15710628032684326, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.5762127041816711, + "step": 17280 + }, + { + "ce_loss": 0.14088711142539978, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.25248652696609497, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.1329662948846817, + "step": 17280 + }, + { + "epoch": 5.76717811874583, + "loss": 0.6322, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "grad_norm": 1.545915126800537, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "learning_rate": 0.0003205869147939017, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.5105368494987488, + "step": 17290 + }, + { + "ce_loss": 0.1355862319469452, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.22150439023971558, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.12638546526432037, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.5935248136520386, + "step": 17290 + }, + { + "ce_loss": 0.13656610250473022, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.21289098262786865, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.1504911184310913, + "step": 17290 + }, + { + "epoch": 5.770513675783856, + "loss": 0.6362, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "grad_norm": 2.608363151550293, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "learning_rate": 0.00032016343206644907, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.6189454197883606, + "step": 17300 + }, + { + "ce_loss": 0.15753212571144104, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.2742728292942047, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.1473052054643631, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.6348935961723328, + "step": 17300 + }, + { + "ce_loss": 0.17919529974460602, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.2727394104003906, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.1823030263185501, + "step": 17300 + }, + { + "epoch": 5.773849232821881, + "loss": 0.6421, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "grad_norm": 1.9108775854110718, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "learning_rate": 0.00031974004251686205, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.5485625267028809, + "step": 17310 + }, + { + "ce_loss": 0.1477338820695877, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.24226287007331848, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.1192854717373848, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.480712890625, + "step": 17310 + }, + { + "ce_loss": 0.12280880659818649, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.2515189051628113, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.10570239275693893, + "step": 17310 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.6183, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "grad_norm": 1.7842615842819214, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "learning_rate": 0.00031931674663928164, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.5594913959503174, + "step": 17320 + }, + { + "ce_loss": 0.11906707286834717, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.2082429677248001, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.1027420163154602, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.554329514503479, + "step": 17320 + }, + { + "ce_loss": 0.14408791065216064, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.22791585326194763, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.1325961947441101, + "step": 17320 + }, + { + "epoch": 5.780520346897932, + "loss": 0.6579, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "grad_norm": 3.7666094303131104, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "learning_rate": 0.00031889354492773987, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.8298982381820679, + "step": 17330 + }, + { + "ce_loss": 0.19762609899044037, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.28875911235809326, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.1450854390859604, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.626959502696991, + "step": 17330 + }, + { + "ce_loss": 0.15268436074256897, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.29340776801109314, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.13883355259895325, + "step": 17330 + }, + { + "epoch": 5.783855903935957, + "loss": 0.731, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "grad_norm": 1.4649722576141357, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "learning_rate": 0.0003184704378761585, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.8531560301780701, + "step": 17340 + }, + { + "ce_loss": 0.1586577296257019, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.31400057673454285, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.12301208823919296, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.8586217761039734, + "step": 17340 + }, + { + "ce_loss": 0.2559804320335388, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.3865933418273926, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.17483940720558167, + "step": 17340 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.7778, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "grad_norm": 1.4831793308258057, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "learning_rate": 0.0003180474259783492, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.7598646879196167, + "step": 17350 + }, + { + "ce_loss": 0.19547435641288757, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.3320785164833069, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.18037301301956177, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.6318597197532654, + "step": 17350 + }, + { + "ce_loss": 0.17297153174877167, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.26058369874954224, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.15463228523731232, + "step": 17350 + }, + { + "epoch": 5.790527018012008, + "loss": 0.6925, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "grad_norm": 2.5347464084625244, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "learning_rate": 0.00031762450972801215, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.7156696319580078, + "step": 17360 + }, + { + "ce_loss": 0.16093580424785614, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.3110997676849365, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.1486850380897522, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.6132699847221375, + "step": 17360 + }, + { + "ce_loss": 0.12380540370941162, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.262518048286438, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.12827558815479279, + "step": 17360 + }, + { + "epoch": 5.793862575050033, + "loss": 0.6978, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "grad_norm": 1.4433140754699707, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "learning_rate": 0.0003172016896187361, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.7008747458457947, + "step": 17370 + }, + { + "ce_loss": 0.13500070571899414, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.27394983172416687, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.12862640619277954, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.5868788361549377, + "step": 17370 + }, + { + "ce_loss": 0.15697240829467773, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.26224878430366516, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.14263705909252167, + "step": 17370 + }, + { + "epoch": 5.797198132088059, + "loss": 0.6766, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "grad_norm": 1.6639288663864136, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "learning_rate": 0.00031677896614399796, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 1.1503751277923584, + "step": 17380 + }, + { + "ce_loss": 0.24827058613300323, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.39232802391052246, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.2320650964975357, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 0.6632513999938965, + "step": 17380 + }, + { + "ce_loss": 0.1472640186548233, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.32942354679107666, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.14712239801883698, + "step": 17380 + }, + { + "epoch": 5.800533689126084, + "loss": 0.7225, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "grad_norm": 1.553830623626709, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "learning_rate": 0.0003163563397971611, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.5071688294410706, + "step": 17390 + }, + { + "ce_loss": 0.1248534694314003, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.2655135989189148, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.1165119931101799, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.7314570546150208, + "step": 17390 + }, + { + "ce_loss": 0.20705215632915497, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.272818922996521, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.1565602719783783, + "step": 17390 + }, + { + "epoch": 5.803869246164109, + "loss": 0.7273, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "grad_norm": 2.529906988143921, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "learning_rate": 0.0003159338110714762, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.638818085193634, + "step": 17400 + }, + { + "ce_loss": 0.1544320285320282, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.33618006110191345, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.14788387715816498, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.7675939202308655, + "step": 17400 + }, + { + "ce_loss": 0.1987285017967224, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.3170737326145172, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.15776485204696655, + "step": 17400 + }, + { + "epoch": 5.807204803202135, + "loss": 0.703, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "grad_norm": 1.9354774951934814, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "learning_rate": 0.0003155113804600797, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.8200332522392273, + "step": 17410 + }, + { + "ce_loss": 0.21878603100776672, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.3642941117286682, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.15945042669773102, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.9217702150344849, + "step": 17410 + }, + { + "ce_loss": 0.2335110753774643, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.36487433314323425, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.20070920884609222, + "step": 17410 + }, + { + "epoch": 5.81054036024016, + "loss": 0.7387, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "grad_norm": 1.5643854141235352, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "learning_rate": 0.00031508904845599356, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.7375587224960327, + "step": 17420 + }, + { + "ce_loss": 0.20015650987625122, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.32561731338500977, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.15708674490451813, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.8183858394622803, + "step": 17420 + }, + { + "ce_loss": 0.2078881412744522, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.29002147912979126, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.12911447882652283, + "step": 17420 + }, + { + "epoch": 5.813875917278185, + "loss": 0.7037, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "grad_norm": 1.4947174787521362, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "learning_rate": 0.0003146668155521247, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.4583217203617096, + "step": 17430 + }, + { + "ce_loss": 0.0980353131890297, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.20246006548404694, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.10573271661996841, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.6630755066871643, + "step": 17430 + }, + { + "ce_loss": 0.1648399978876114, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.2990220785140991, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.1747708022594452, + "step": 17430 + }, + { + "epoch": 5.817211474316211, + "loss": 0.6581, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "grad_norm": 2.1349339485168457, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "learning_rate": 0.0003142446822412643, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.8816218376159668, + "step": 17440 + }, + { + "ce_loss": 0.24884732067584991, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.3589048385620117, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.17393577098846436, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.7268455028533936, + "step": 17440 + }, + { + "ce_loss": 0.19337479770183563, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.335417240858078, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.1636698693037033, + "step": 17440 + }, + { + "epoch": 5.820547031354236, + "loss": 0.7032, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "grad_norm": 3.8691024780273438, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "learning_rate": 0.00031382264901608735, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.8245714902877808, + "step": 17450 + }, + { + "ce_loss": 0.20893444120883942, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.3680214285850525, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.1830359250307083, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.5935423374176025, + "step": 17450 + }, + { + "ce_loss": 0.15983273088932037, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.18705280125141144, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.14525040984153748, + "step": 17450 + }, + { + "epoch": 5.823882588392261, + "loss": 0.6841, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "grad_norm": 2.8735129833221436, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "learning_rate": 0.00031340071636915207, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.5384118556976318, + "step": 17460 + }, + { + "ce_loss": 0.18613983690738678, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.2151462584733963, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.1369083672761917, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.6384410262107849, + "step": 17460 + }, + { + "ce_loss": 0.138434499502182, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.2718351185321808, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.13998080790042877, + "step": 17460 + }, + { + "epoch": 5.827218145430287, + "loss": 0.6842, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "grad_norm": 2.4372873306274414, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "learning_rate": 0.00031297888479289926, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.7715685367584229, + "step": 17470 + }, + { + "ce_loss": 0.21597355604171753, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.30613934993743896, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.1433599293231964, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.6983997821807861, + "step": 17470 + }, + { + "ce_loss": 0.21408066153526306, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.28993719816207886, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.14629687368869781, + "step": 17470 + }, + { + "epoch": 5.830553702468312, + "loss": 0.7288, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "grad_norm": 2.8923749923706055, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "learning_rate": 0.00031255715477965164, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.8305673599243164, + "step": 17480 + }, + { + "ce_loss": 0.18007422983646393, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.28398704528808594, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.18858270347118378, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.7498915791511536, + "step": 17480 + }, + { + "ce_loss": 0.15216076374053955, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.38887834548950195, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.14917370676994324, + "step": 17480 + }, + { + "epoch": 5.833889259506337, + "loss": 0.7539, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "grad_norm": 3.5626888275146484, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "learning_rate": 0.0003121355268216137, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.7058088779449463, + "step": 17490 + }, + { + "ce_loss": 0.14639702439308167, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.41465479135513306, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.14412228763103485, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.7046496868133545, + "step": 17490 + }, + { + "ce_loss": 0.15008413791656494, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.30911529064178467, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.1350952535867691, + "step": 17490 + }, + { + "epoch": 5.837224816544363, + "loss": 0.7263, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "grad_norm": 2.2108681201934814, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "learning_rate": 0.0003117140014108707, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 0.7081411480903625, + "step": 17500 + }, + { + "ce_loss": 0.16158966720104218, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.31263870000839233, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.12841305136680603, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 1.1486976146697998, + "step": 17500 + }, + { + "ce_loss": 0.20025956630706787, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.34252017736434937, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.21556495130062103, + "step": 17500 + }, + { + "epoch": 5.840560373582388, + "loss": 0.7424, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "grad_norm": 2.667424201965332, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "learning_rate": 0.00031129257903938785, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.779916524887085, + "step": 17510 + }, + { + "ce_loss": 0.2496323585510254, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.3321917653083801, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.12754826247692108, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.7531536221504211, + "step": 17510 + }, + { + "ce_loss": 0.1766018122434616, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.3808102309703827, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.1379728615283966, + "step": 17510 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.7667, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "grad_norm": 1.9223077297210693, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "learning_rate": 0.0003108712601990107, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.797112226486206, + "step": 17520 + }, + { + "ce_loss": 0.21410100162029266, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.29613161087036133, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.1464604139328003, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.7728590965270996, + "step": 17520 + }, + { + "ce_loss": 0.13434790074825287, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.2363002598285675, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.1437375843524933, + "step": 17520 + }, + { + "epoch": 5.847231487658439, + "loss": 0.7348, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "grad_norm": 2.909083604812622, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "learning_rate": 0.0003104500453814635, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.7808569669723511, + "step": 17530 + }, + { + "ce_loss": 0.15024033188819885, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.32764944434165955, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.14453238248825073, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.6624705195426941, + "step": 17530 + }, + { + "ce_loss": 0.1971026360988617, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.276828408241272, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.14170217514038086, + "step": 17530 + }, + { + "epoch": 5.850567044696464, + "loss": 0.7462, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "grad_norm": 1.7439287900924683, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "learning_rate": 0.00031002893507834934, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.7160279750823975, + "step": 17540 + }, + { + "ce_loss": 0.15651878714561462, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.289535254240036, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.14303657412528992, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.739136278629303, + "step": 17540 + }, + { + "ce_loss": 0.18893566727638245, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.34339919686317444, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.16350984573364258, + "step": 17540 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.8228, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "grad_norm": 2.1306843757629395, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "learning_rate": 0.0003096079297811492, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.7175245881080627, + "step": 17550 + }, + { + "ce_loss": 0.17793938517570496, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.33758652210235596, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.14965596795082092, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.904224157333374, + "step": 17550 + }, + { + "ce_loss": 0.22401635348796844, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.34708863496780396, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.18091994524002075, + "step": 17550 + }, + { + "epoch": 5.857238158772515, + "loss": 0.7586, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "grad_norm": 1.8932132720947266, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "learning_rate": 0.00030918702998122165, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.6691727638244629, + "step": 17560 + }, + { + "ce_loss": 0.179330974817276, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.25679606199264526, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.13041779398918152, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.7127830386161804, + "step": 17560 + }, + { + "ce_loss": 0.1948421448469162, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.29180899262428284, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.16397015750408173, + "step": 17560 + }, + { + "epoch": 5.86057371581054, + "loss": 0.7173, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "grad_norm": 7.5867180824279785, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "learning_rate": 0.0003087662361698019, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.6970691084861755, + "step": 17570 + }, + { + "ce_loss": 0.1313159167766571, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.24335519969463348, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.15353406965732574, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.7506760358810425, + "step": 17570 + }, + { + "ce_loss": 0.18178677558898926, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.23799774050712585, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.1689431518316269, + "step": 17570 + }, + { + "epoch": 5.863909272848566, + "loss": 0.7146, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "grad_norm": 4.501378536224365, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "learning_rate": 0.00030834554883800176, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.7862856984138489, + "step": 17580 + }, + { + "ce_loss": 0.2624962329864502, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.283257395029068, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.19673359394073486, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.6927284598350525, + "step": 17580 + }, + { + "ce_loss": 0.15992389619350433, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.32113510370254517, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.15669889748096466, + "step": 17580 + }, + { + "epoch": 5.867244829886591, + "loss": 0.7183, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "grad_norm": 2.619765520095825, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "learning_rate": 0.00030792496847680835, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.6546903848648071, + "step": 17590 + }, + { + "ce_loss": 0.16821420192718506, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.2883762717247009, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.11920984834432602, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.5550947785377502, + "step": 17590 + }, + { + "ce_loss": 0.19670464098453522, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.23802609741687775, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.11984530091285706, + "step": 17590 + }, + { + "epoch": 5.870580386924616, + "loss": 0.7165, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "grad_norm": 2.4450623989105225, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "learning_rate": 0.0003075044955770847, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 1.1171493530273438, + "step": 17600 + }, + { + "ce_loss": 0.21308808028697968, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.3068980872631073, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.16722099483013153, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 0.7828159332275391, + "step": 17600 + }, + { + "ce_loss": 0.1695491075515747, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.29257312417030334, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.1478341817855835, + "step": 17600 + }, + { + "epoch": 5.873915943962642, + "loss": 0.6831, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "grad_norm": 1.61392080783844, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "learning_rate": 0.0003070841306295675, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.671332061290741, + "step": 17610 + }, + { + "ce_loss": 0.15654361248016357, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.24826830625534058, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.12748834490776062, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.7003882527351379, + "step": 17610 + }, + { + "ce_loss": 0.23483048379421234, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.2979724407196045, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.1305704265832901, + "step": 17610 + }, + { + "epoch": 5.877251501000667, + "loss": 0.6819, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "grad_norm": 1.6848235130310059, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "learning_rate": 0.00030666387412486807, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.9243027567863464, + "step": 17620 + }, + { + "ce_loss": 0.2511441111564636, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.32854366302490234, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.22302629053592682, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.8413571119308472, + "step": 17620 + }, + { + "ce_loss": 0.1491873562335968, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.2715878486633301, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.11943260580301285, + "step": 17620 + }, + { + "epoch": 5.880587058038692, + "loss": 0.737, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "grad_norm": 1.6482051610946655, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "learning_rate": 0.00030624372655347086, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.6240556240081787, + "step": 17630 + }, + { + "ce_loss": 0.173484206199646, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.26367682218551636, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.13847512006759644, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.5382674336433411, + "step": 17630 + }, + { + "ce_loss": 0.11168865114450455, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.2589222192764282, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.12466815859079361, + "step": 17630 + }, + { + "epoch": 5.883922615076718, + "loss": 0.7198, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "grad_norm": 1.565881609916687, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "learning_rate": 0.00030582368840573345, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.8074979782104492, + "step": 17640 + }, + { + "ce_loss": 0.19353412091732025, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.2784769535064697, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.1902574896812439, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.6128374338150024, + "step": 17640 + }, + { + "ce_loss": 0.15676084160804749, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.3117706775665283, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.14417685568332672, + "step": 17640 + }, + { + "epoch": 5.887258172114743, + "loss": 0.7086, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "grad_norm": 3.2013261318206787, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "learning_rate": 0.0003054037601718854, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.8468448519706726, + "step": 17650 + }, + { + "ce_loss": 0.175635427236557, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.2721385359764099, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.15196819603443146, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.9644908905029297, + "step": 17650 + }, + { + "ce_loss": 0.20255514979362488, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.33900919556617737, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.15699605643749237, + "step": 17650 + }, + { + "epoch": 5.890593729152768, + "loss": 0.7547, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "grad_norm": 1.6605432033538818, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "learning_rate": 0.00030498394234202824, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.8745073080062866, + "step": 17660 + }, + { + "ce_loss": 0.1588677167892456, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.3496779501438141, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.19170548021793365, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.8664857745170593, + "step": 17660 + }, + { + "ce_loss": 0.17014363408088684, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.3439854383468628, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.1484474539756775, + "step": 17660 + }, + { + "epoch": 5.893929286190794, + "loss": 0.7478, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "grad_norm": 1.7497375011444092, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "learning_rate": 0.0003045642354061345, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.7250229120254517, + "step": 17670 + }, + { + "ce_loss": 0.18709981441497803, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.39688920974731445, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.11318523436784744, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.6799331903457642, + "step": 17670 + }, + { + "ce_loss": 0.1577003002166748, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.31221145391464233, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.13887915015220642, + "step": 17670 + }, + { + "epoch": 5.897264843228819, + "loss": 0.7058, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "grad_norm": 1.4480282068252563, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "learning_rate": 0.00030414463985404736, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 0.9352344870567322, + "step": 17680 + }, + { + "ce_loss": 0.2799140512943268, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.4020175039768219, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.1616601049900055, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 0.8874188661575317, + "step": 17680 + }, + { + "ce_loss": 0.19736677408218384, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.3230544924736023, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.10736200213432312, + "step": 17680 + }, + { + "epoch": 5.900600400266844, + "loss": 0.7224, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "grad_norm": 1.8025034666061401, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "learning_rate": 0.0003037251561754799, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.6984555721282959, + "step": 17690 + }, + { + "ce_loss": 0.1678963005542755, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.28823724389076233, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.1307564228773117, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.6367653608322144, + "step": 17690 + }, + { + "ce_loss": 0.15495973825454712, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.2730174660682678, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.1531836986541748, + "step": 17690 + }, + { + "epoch": 5.90393595730487, + "loss": 0.7038, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "grad_norm": 1.8348549604415894, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "learning_rate": 0.00030330578486001473, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.6442775130271912, + "step": 17700 + }, + { + "ce_loss": 0.19366510212421417, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.26765328645706177, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.11750882118940353, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.7802560329437256, + "step": 17700 + }, + { + "ce_loss": 0.2070993334054947, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.35392361879348755, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.17282815277576447, + "step": 17700 + }, + { + "epoch": 5.907271514342895, + "loss": 0.6966, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "grad_norm": 1.3651440143585205, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "learning_rate": 0.00030288652639710357, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.6090908646583557, + "step": 17710 + }, + { + "ce_loss": 0.1301243156194687, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.2743867039680481, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.09706619381904602, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.6999118328094482, + "step": 17710 + }, + { + "ce_loss": 0.20652586221694946, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.3514205813407898, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.14169923961162567, + "step": 17710 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.6978, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "grad_norm": 1.9332634210586548, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "learning_rate": 0.0003024673812760658, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.6061062216758728, + "step": 17720 + }, + { + "ce_loss": 0.18262982368469238, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.2813739776611328, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.14188861846923828, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.7180115580558777, + "step": 17720 + }, + { + "ce_loss": 0.20352500677108765, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.2630084156990051, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.13348890841007233, + "step": 17720 + }, + { + "epoch": 5.913942628418946, + "loss": 0.7463, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "grad_norm": 1.853853464126587, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "learning_rate": 0.0003020483499860891, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.980445384979248, + "step": 17730 + }, + { + "ce_loss": 0.13897709548473358, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.2369052916765213, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.10596528649330139, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.5980679392814636, + "step": 17730 + }, + { + "ce_loss": 0.16462469100952148, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.22286882996559143, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.11621631681919098, + "step": 17730 + }, + { + "epoch": 5.917278185456971, + "loss": 0.6909, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "grad_norm": 1.5366535186767578, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "learning_rate": 0.00030162943301622794, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.7122682332992554, + "step": 17740 + }, + { + "ce_loss": 0.14937813580036163, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.2587045133113861, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.13506940007209778, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.7699155807495117, + "step": 17740 + }, + { + "ce_loss": 0.19589021801948547, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.3217250108718872, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.15825651586055756, + "step": 17740 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.7326, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "grad_norm": 2.868001699447632, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "learning_rate": 0.0003012106308554036, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.6667677164077759, + "step": 17750 + }, + { + "ce_loss": 0.16533072292804718, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.3047916889190674, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.11249995231628418, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.8356540203094482, + "step": 17750 + }, + { + "ce_loss": 0.1387956738471985, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.28860345482826233, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.11070244014263153, + "step": 17750 + }, + { + "epoch": 5.923949299533022, + "loss": 0.7566, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "grad_norm": 1.771466612815857, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "learning_rate": 0.00030079194399240325, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.8782122135162354, + "step": 17760 + }, + { + "ce_loss": 0.2122136503458023, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.33710652589797974, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.18948283791542053, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.9718055725097656, + "step": 17760 + }, + { + "ce_loss": 0.17462380230426788, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.35814768075942993, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.16498197615146637, + "step": 17760 + }, + { + "epoch": 5.927284856571047, + "loss": 0.7445, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "grad_norm": 2.248972177505493, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "learning_rate": 0.00030037337291587943, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.6510271430015564, + "step": 17770 + }, + { + "ce_loss": 0.17040914297103882, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.31474000215530396, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.13253526389598846, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.9578293561935425, + "step": 17770 + }, + { + "ce_loss": 0.20671266317367554, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.3458459973335266, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.15940354764461517, + "step": 17770 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.7793, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "grad_norm": 2.1776537895202637, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "learning_rate": 0.00029995491811434975, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.7826830744743347, + "step": 17780 + }, + { + "ce_loss": 0.17325259745121002, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.3214309811592102, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.16599765419960022, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.8357514142990112, + "step": 17780 + }, + { + "ce_loss": 0.2053692638874054, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.3940480947494507, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.1913134604692459, + "step": 17780 + }, + { + "epoch": 5.933955970647098, + "loss": 0.8344, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "grad_norm": 4.898279190063477, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "learning_rate": 0.0002995365800761959, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.6484854817390442, + "step": 17790 + }, + { + "ce_loss": 0.14094069600105286, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.3030979335308075, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.1547994613647461, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.7239052653312683, + "step": 17790 + }, + { + "ce_loss": 0.20402802526950836, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.3449753224849701, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.14525295794010162, + "step": 17790 + }, + { + "epoch": 5.937291527685123, + "loss": 0.6571, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "grad_norm": 1.4168012142181396, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "learning_rate": 0.00029911835928966347, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 0.5196230411529541, + "step": 17800 + }, + { + "ce_loss": 0.10742348432540894, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.22581657767295837, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.1268647164106369, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 1.5188542604446411, + "step": 17800 + }, + { + "ce_loss": 0.25732168555259705, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.3840293288230896, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.15666557848453522, + "step": 17800 + }, + { + "epoch": 5.940627084723149, + "loss": 0.7761, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "grad_norm": 2.4067792892456055, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "learning_rate": 0.0002987002562428608, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 0.6301429867744446, + "step": 17810 + }, + { + "ce_loss": 0.17194992303848267, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.28008875250816345, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.1431414633989334, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 1.343503713607788, + "step": 17810 + }, + { + "ce_loss": 0.24207206070423126, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.3009220063686371, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.16927200555801392, + "step": 17810 + }, + { + "epoch": 5.943962641761174, + "loss": 0.8039, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "grad_norm": 1.7773637771606445, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "learning_rate": 0.0002982822714237596, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.7795100808143616, + "step": 17820 + }, + { + "ce_loss": 0.1566115766763687, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.3505699634552002, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.13496427237987518, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.7355562448501587, + "step": 17820 + }, + { + "ce_loss": 0.1443626433610916, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.2736237943172455, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.12582212686538696, + "step": 17820 + }, + { + "epoch": 5.947298198799199, + "loss": 0.7121, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "grad_norm": 1.619713306427002, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "learning_rate": 0.000297864405320193, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.6389651298522949, + "step": 17830 + }, + { + "ce_loss": 0.1558423936367035, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.2980443239212036, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.1452484279870987, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.6044831275939941, + "step": 17830 + }, + { + "ce_loss": 0.19478633999824524, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.277583509683609, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.13188131153583527, + "step": 17830 + }, + { + "epoch": 5.950633755837225, + "loss": 0.7095, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "grad_norm": 2.5467517375946045, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "learning_rate": 0.0002974466584198555, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.4841284453868866, + "step": 17840 + }, + { + "ce_loss": 0.13817235827445984, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.19160957634449005, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.12183627486228943, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.6094743013381958, + "step": 17840 + }, + { + "ce_loss": 0.126152902841568, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.18613648414611816, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.13332106173038483, + "step": 17840 + }, + { + "epoch": 5.95396931287525, + "loss": 0.7089, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "grad_norm": 1.8888506889343262, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "learning_rate": 0.00029702903121030293, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.8191925287246704, + "step": 17850 + }, + { + "ce_loss": 0.20863190293312073, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.2998040020465851, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.14137405157089233, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.740609884262085, + "step": 17850 + }, + { + "ce_loss": 0.1715041548013687, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.26939496397972107, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.17530490458011627, + "step": 17850 + }, + { + "epoch": 5.957304869913275, + "loss": 0.7317, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "grad_norm": 1.600746750831604, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "learning_rate": 0.00029661152417895096, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.5815998315811157, + "step": 17860 + }, + { + "ce_loss": 0.1620999574661255, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.25247102975845337, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.14225803315639496, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.635635256767273, + "step": 17860 + }, + { + "ce_loss": 0.200776144862175, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.27792149782180786, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.15670597553253174, + "step": 17860 + }, + { + "epoch": 5.960640426951301, + "loss": 0.6967, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "grad_norm": 2.040088653564453, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "learning_rate": 0.00029619413781307546, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.8797625303268433, + "step": 17870 + }, + { + "ce_loss": 0.24724845588207245, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.33950746059417725, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.16486626863479614, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.7061132192611694, + "step": 17870 + }, + { + "ce_loss": 0.16689732670783997, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.2903771996498108, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.13571348786354065, + "step": 17870 + }, + { + "epoch": 5.963975983989326, + "loss": 0.7297, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "grad_norm": 1.6686172485351562, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "learning_rate": 0.000295776872599811, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.7089078426361084, + "step": 17880 + }, + { + "ce_loss": 0.12177518755197525, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.3120511472225189, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.11147940903902054, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.8145482540130615, + "step": 17880 + }, + { + "ce_loss": 0.15186285972595215, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.372964471578598, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.12150125205516815, + "step": 17880 + }, + { + "epoch": 5.967311541027351, + "loss": 0.7587, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "grad_norm": 1.8515465259552002, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "learning_rate": 0.0002953597290261512, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.6790962219238281, + "step": 17890 + }, + { + "ce_loss": 0.16520093381404877, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.33001717925071716, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.13531477749347687, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.5637715458869934, + "step": 17890 + }, + { + "ce_loss": 0.1474059373140335, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.24696558713912964, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.13812749087810516, + "step": 17890 + }, + { + "epoch": 5.970647098065377, + "loss": 0.7349, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "grad_norm": 1.8551923036575317, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "learning_rate": 0.00029494270757894733, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 0.7883302569389343, + "step": 17900 + }, + { + "ce_loss": 0.15675842761993408, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.32231029868125916, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.18687209486961365, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 0.6241994500160217, + "step": 17900 + }, + { + "ce_loss": 0.1367034614086151, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.32802969217300415, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.11875439435243607, + "step": 17900 + }, + { + "epoch": 5.973982655103402, + "loss": 0.7017, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "grad_norm": 1.8577299118041992, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "learning_rate": 0.00029452580874490835, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.5608646869659424, + "step": 17910 + }, + { + "ce_loss": 0.11458832770586014, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.24180865287780762, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.12261128425598145, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.7354592084884644, + "step": 17910 + }, + { + "ce_loss": 0.18657203018665314, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.3183566927909851, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.14715497195720673, + "step": 17910 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.7461, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "grad_norm": 1.6920195817947388, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "learning_rate": 0.00029410903301059987, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 1.1519207954406738, + "step": 17920 + }, + { + "ce_loss": 0.22299034893512726, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.4153600335121155, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.22701136767864227, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.7285803556442261, + "step": 17920 + }, + { + "ce_loss": 0.17011786997318268, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.2496124505996704, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.11554224044084549, + "step": 17920 + }, + { + "epoch": 5.980653769179453, + "loss": 0.7015, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "grad_norm": 1.4887522459030151, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "learning_rate": 0.0002936923808624444, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.5744050145149231, + "step": 17930 + }, + { + "ce_loss": 0.1501711755990982, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.22328665852546692, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.16832423210144043, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.6508606672286987, + "step": 17930 + }, + { + "ce_loss": 0.1293591558933258, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.3075183629989624, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.11165163666009903, + "step": 17930 + }, + { + "epoch": 5.983989326217478, + "loss": 0.6488, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "grad_norm": 1.510502576828003, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "learning_rate": 0.0002932758527867196, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.5948408842086792, + "step": 17940 + }, + { + "ce_loss": 0.13838577270507812, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.26575592160224915, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.1424756646156311, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.6306717991828918, + "step": 17940 + }, + { + "ce_loss": 0.14688876271247864, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.2843662202358246, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.1535085290670395, + "step": 17940 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.6957, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "grad_norm": 5.702191352844238, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "learning_rate": 0.0002928594492695586, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.5361848473548889, + "step": 17950 + }, + { + "ce_loss": 0.10649754852056503, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.15731768310070038, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.11794476956129074, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.7358103394508362, + "step": 17950 + }, + { + "ce_loss": 0.11887186020612717, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.19445015490055084, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.10782976448535919, + "step": 17950 + }, + { + "epoch": 5.990660440293529, + "loss": 0.6542, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "grad_norm": 1.7833524942398071, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "learning_rate": 0.00029244317079694915, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.6083264946937561, + "step": 17960 + }, + { + "ce_loss": 0.12414541840553284, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.2250789850950241, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.13253392279148102, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.5254873037338257, + "step": 17960 + }, + { + "ce_loss": 0.1291838437318802, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.25294917821884155, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.11796507984399796, + "step": 17960 + }, + { + "epoch": 5.993995997331554, + "loss": 0.6692, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "grad_norm": 1.4126019477844238, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "learning_rate": 0.0002920270178547329, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.5689122676849365, + "step": 17970 + }, + { + "ce_loss": 0.18033574521541595, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.21025502681732178, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.13225938379764557, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.5316141247749329, + "step": 17970 + }, + { + "ce_loss": 0.1508607119321823, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.22014158964157104, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.12547865509986877, + "step": 17970 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.6219, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "grad_norm": 2.276890277862549, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "learning_rate": 0.00029161099092860527, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.6732531785964966, + "step": 17980 + }, + { + "ce_loss": 0.16249193251132965, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.2430381178855896, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.16172070801258087, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.478391170501709, + "step": 17980 + }, + { + "ce_loss": 0.13364195823669434, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.21999022364616394, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.12411677092313766, + "step": 17980 + }, + { + "epoch": 6.000667111407605, + "loss": 0.7267, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "grad_norm": 3.1330134868621826, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "learning_rate": 0.00029119509050411435, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.6111536026000977, + "step": 17990 + }, + { + "ce_loss": 0.12887738645076752, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.25863373279571533, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.133753702044487, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.43454331159591675, + "step": 17990 + }, + { + "ce_loss": 0.08036650717258453, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.16697892546653748, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.0892324298620224, + "step": 17990 + }, + { + "epoch": 6.00400266844563, + "loss": 0.6487, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "grad_norm": 1.4188312292099, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "learning_rate": 0.0002907793170666606, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.6451430320739746, + "step": 18000 + }, + { + "ce_loss": 0.1146027222275734, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.277157187461853, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.10770218074321747, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.84647536277771, + "step": 18000 + }, + { + "ce_loss": 0.17758968472480774, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.31190869212150574, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.1489148736000061, + "step": 18000 + }, + { + "epoch": 6.007338225483656, + "loss": 0.7068, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "grad_norm": 2.7114202976226807, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "learning_rate": 0.0002903636711014966, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.5021632313728333, + "step": 18010 + }, + { + "ce_loss": 0.09225542098283768, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.27368324995040894, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.0975363552570343, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.6700934171676636, + "step": 18010 + }, + { + "ce_loss": 0.19836029410362244, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.33002209663391113, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.11219165474176407, + "step": 18010 + }, + { + "epoch": 6.010673782521681, + "loss": 0.7095, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "grad_norm": 3.8896095752716064, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "learning_rate": 0.0002899481530937257, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.654662549495697, + "step": 18020 + }, + { + "ce_loss": 0.10406067222356796, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.30926966667175293, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.1180790588259697, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.46157923340797424, + "step": 18020 + }, + { + "ce_loss": 0.0768638551235199, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.25196242332458496, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.08556817471981049, + "step": 18020 + }, + { + "epoch": 6.014009339559706, + "loss": 0.6762, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "grad_norm": 1.584222674369812, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "learning_rate": 0.0002895327635283021, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.6818699836730957, + "step": 18030 + }, + { + "ce_loss": 0.1505424976348877, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.3525632917881012, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.12546685338020325, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.48431217670440674, + "step": 18030 + }, + { + "ce_loss": 0.12620176374912262, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.21929430961608887, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.11054594814777374, + "step": 18030 + }, + { + "epoch": 6.017344896597732, + "loss": 0.7209, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "grad_norm": 3.463015079498291, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "learning_rate": 0.00028911750289003043, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.5016592144966125, + "step": 18040 + }, + { + "ce_loss": 0.12095203250646591, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.27965325117111206, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.10088794678449631, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.8476541042327881, + "step": 18040 + }, + { + "ce_loss": 0.1933862417936325, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.4288746118545532, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.12653212249279022, + "step": 18040 + }, + { + "epoch": 6.020680453635757, + "loss": 0.7125, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "grad_norm": 1.573563814163208, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "learning_rate": 0.00028870237166356424, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.5512456297874451, + "step": 18050 + }, + { + "ce_loss": 0.07267727702856064, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.2865201532840729, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.08283307403326035, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.6286212205886841, + "step": 18050 + }, + { + "ce_loss": 0.1411493569612503, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.30221548676490784, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.1384388655424118, + "step": 18050 + }, + { + "epoch": 6.024016010673782, + "loss": 0.6186, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "grad_norm": 1.77467942237854, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "learning_rate": 0.0002882873703334065, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.525521993637085, + "step": 18060 + }, + { + "ce_loss": 0.1251913160085678, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.305527925491333, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.09452289342880249, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.7104737162590027, + "step": 18060 + }, + { + "ce_loss": 0.09152280539274216, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.22016364336013794, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.1253073811531067, + "step": 18060 + }, + { + "epoch": 6.027351567711808, + "loss": 0.6115, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "grad_norm": 1.388637900352478, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "learning_rate": 0.0002878724993839083, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.7839659452438354, + "step": 18070 + }, + { + "ce_loss": 0.15709657967090607, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.23128685355186462, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.1473115086555481, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.4913570284843445, + "step": 18070 + }, + { + "ce_loss": 0.10992088168859482, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.23345129191875458, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.12018200010061264, + "step": 18070 + }, + { + "epoch": 6.030687124749833, + "loss": 0.605, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "grad_norm": 1.437814712524414, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "learning_rate": 0.0002874577592992688, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.8127552270889282, + "step": 18080 + }, + { + "ce_loss": 0.19625712931156158, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.33098307251930237, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.12643705308437347, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.6383655667304993, + "step": 18080 + }, + { + "ce_loss": 0.14720506966114044, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.2253483533859253, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.11266171187162399, + "step": 18080 + }, + { + "epoch": 6.034022681787858, + "loss": 0.6341, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "grad_norm": 1.5874073505401611, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "learning_rate": 0.00028704315056353414, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.6964699625968933, + "step": 18090 + }, + { + "ce_loss": 0.1716633439064026, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.2955109179019928, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.11812959611415863, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.5283066630363464, + "step": 18090 + }, + { + "ce_loss": 0.12395858019590378, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.23972785472869873, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.10385114699602127, + "step": 18090 + }, + { + "epoch": 6.037358238825884, + "loss": 0.6332, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "grad_norm": 1.2264511585235596, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "learning_rate": 0.00028662867366059756, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.8792027235031128, + "step": 18100 + }, + { + "ce_loss": 0.17050066590309143, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.2821817696094513, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.1407461166381836, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.5104156732559204, + "step": 18100 + }, + { + "ce_loss": 0.14286614954471588, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.2257169485092163, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.10134001076221466, + "step": 18100 + }, + { + "epoch": 6.040693795863909, + "loss": 0.6198, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "grad_norm": 2.148345708847046, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "learning_rate": 0.000286214329074198, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.4621307849884033, + "step": 18110 + }, + { + "ce_loss": 0.0889039933681488, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.23973098397254944, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.08387582004070282, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.597586989402771, + "step": 18110 + }, + { + "ce_loss": 0.18086780607700348, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.2776992619037628, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.13857479393482208, + "step": 18110 + }, + { + "epoch": 6.044029352901934, + "loss": 0.6028, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "grad_norm": 1.5330827236175537, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "learning_rate": 0.0002858001172879202, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.5574997663497925, + "step": 18120 + }, + { + "ce_loss": 0.13679958879947662, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.2656584680080414, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.15457755327224731, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.49670228362083435, + "step": 18120 + }, + { + "ce_loss": 0.1344117373228073, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.23335647583007812, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.09552182257175446, + "step": 18120 + }, + { + "epoch": 6.04736490993996, + "loss": 0.6797, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "grad_norm": 1.3088942766189575, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "learning_rate": 0.00028538603878519407, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.743168294429779, + "step": 18130 + }, + { + "ce_loss": 0.1520257294178009, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.3432425558567047, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.11429043114185333, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.8178507089614868, + "step": 18130 + }, + { + "ce_loss": 0.14292679727077484, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.30670297145843506, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.15452007949352264, + "step": 18130 + }, + { + "epoch": 6.050700466977985, + "loss": 0.7141, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "grad_norm": 1.584452748298645, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "learning_rate": 0.00028497209404929345, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.5253815650939941, + "step": 18140 + }, + { + "ce_loss": 0.13081501424312592, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.27755340933799744, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.11679794639348984, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.7471408247947693, + "step": 18140 + }, + { + "ce_loss": 0.15584027767181396, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.3901218771934509, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.15860076248645782, + "step": 18140 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.6564, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "grad_norm": 1.770574927330017, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "learning_rate": 0.0002845582835633367, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.6072221994400024, + "step": 18150 + }, + { + "ce_loss": 0.13476058840751648, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.29309630393981934, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.10323754698038101, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.4680664837360382, + "step": 18150 + }, + { + "ce_loss": 0.07075583934783936, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.24169869720935822, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.11755290627479553, + "step": 18150 + }, + { + "epoch": 6.057371581054036, + "loss": 0.6628, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "grad_norm": 1.2780784368515015, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "learning_rate": 0.0002841446078102852, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.6397781372070312, + "step": 18160 + }, + { + "ce_loss": 0.12417633086442947, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.2742384672164917, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.12048876285552979, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.8331379890441895, + "step": 18160 + }, + { + "ce_loss": 0.16268602013587952, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.24811343848705292, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.15608720481395721, + "step": 18160 + }, + { + "epoch": 6.060707138092061, + "loss": 0.6458, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "grad_norm": 1.2573291063308716, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "learning_rate": 0.00028373106727294276, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.9425279498100281, + "step": 18170 + }, + { + "ce_loss": 0.17792847752571106, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.3310987949371338, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.12824629247188568, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.5275480151176453, + "step": 18170 + }, + { + "ce_loss": 0.11334899067878723, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.27897968888282776, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.13500186800956726, + "step": 18170 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.6116, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "grad_norm": 1.9847238063812256, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "learning_rate": 0.000283317662433956, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.5519295334815979, + "step": 18180 + }, + { + "ce_loss": 0.14950774610042572, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.26175040006637573, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.14031194150447845, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.4585942029953003, + "step": 18180 + }, + { + "ce_loss": 0.0969991460442543, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.2302173674106598, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.09909547120332718, + "step": 18180 + }, + { + "epoch": 6.067378252168112, + "loss": 0.6563, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "grad_norm": 1.5907666683197021, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "learning_rate": 0.00028290439377581263, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.4612014591693878, + "step": 18190 + }, + { + "ce_loss": 0.10405223071575165, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.2557392120361328, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.10129746794700623, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.5775993466377258, + "step": 18190 + }, + { + "ce_loss": 0.1516268253326416, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.3129112720489502, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.11288562417030334, + "step": 18190 + }, + { + "epoch": 6.070713809206137, + "loss": 0.5735, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "grad_norm": 1.4031167030334473, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "learning_rate": 0.0002824912617808418, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.7670556902885437, + "step": 18200 + }, + { + "ce_loss": 0.18870723247528076, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.37014707922935486, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.16599838435649872, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.6280184984207153, + "step": 18200 + }, + { + "ce_loss": 0.13828735053539276, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.30158185958862305, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.12955552339553833, + "step": 18200 + }, + { + "epoch": 6.074049366244163, + "loss": 0.7286, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "grad_norm": 1.8095005750656128, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "learning_rate": 0.00028207826693121287, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.7221490144729614, + "step": 18210 + }, + { + "ce_loss": 0.17217539250850677, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.30401742458343506, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.12987874448299408, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.6990206241607666, + "step": 18210 + }, + { + "ce_loss": 0.1532217115163803, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.342460036277771, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.15716324746608734, + "step": 18210 + }, + { + "epoch": 6.077384923282188, + "loss": 0.6538, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "grad_norm": 1.2875056266784668, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "learning_rate": 0.0002816654097089354, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.9460160732269287, + "step": 18220 + }, + { + "ce_loss": 0.1782992035150528, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.30229833722114563, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.11988240480422974, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.6548318862915039, + "step": 18220 + }, + { + "ce_loss": 0.14591868221759796, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.3423047363758087, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.11783643066883087, + "step": 18220 + }, + { + "epoch": 6.080720480320213, + "loss": 0.7018, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "grad_norm": 1.6654572486877441, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "learning_rate": 0.0002812526905958581, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.6765292882919312, + "step": 18230 + }, + { + "ce_loss": 0.13385047018527985, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.3190274238586426, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.12536045908927917, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.4414009153842926, + "step": 18230 + }, + { + "ce_loss": 0.0661691203713417, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.160070538520813, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.08345791697502136, + "step": 18230 + }, + { + "epoch": 6.084056037358239, + "loss": 0.6399, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "grad_norm": 1.5034816265106201, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "learning_rate": 0.0002808401100736687, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.8535445332527161, + "step": 18240 + }, + { + "ce_loss": 0.22564417123794556, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.3347397446632385, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.13953503966331482, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.6334425806999207, + "step": 18240 + }, + { + "ce_loss": 0.18967677652835846, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.3053939938545227, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.138227179646492, + "step": 18240 + }, + { + "epoch": 6.087391594396264, + "loss": 0.5813, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "grad_norm": 2.3025712966918945, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "learning_rate": 0.0002804276686238928, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.4579137861728668, + "step": 18250 + }, + { + "ce_loss": 0.10760626941919327, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.22375169396400452, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.12626978754997253, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.5572217702865601, + "step": 18250 + }, + { + "ce_loss": 0.14132961630821228, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.25528982281684875, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.1243145763874054, + "step": 18250 + }, + { + "epoch": 6.090727151434289, + "loss": 0.5813, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "grad_norm": 1.3174196481704712, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "learning_rate": 0.00028001536672789414, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.5696294903755188, + "step": 18260 + }, + { + "ce_loss": 0.15350523591041565, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.26622551679611206, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.12345647066831589, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.5882666707038879, + "step": 18260 + }, + { + "ce_loss": 0.0760028064250946, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.24794188141822815, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.11833299696445465, + "step": 18260 + }, + { + "epoch": 6.094062708472315, + "loss": 0.5987, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "grad_norm": 1.450992226600647, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "learning_rate": 0.0002796032048668734, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.6237502694129944, + "step": 18270 + }, + { + "ce_loss": 0.18267591297626495, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.29955023527145386, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.14133091270923615, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.5703005790710449, + "step": 18270 + }, + { + "ce_loss": 0.12525999546051025, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.2633492350578308, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.14371177554130554, + "step": 18270 + }, + { + "epoch": 6.09739826551034, + "loss": 0.6531, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "grad_norm": 1.9310940504074097, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "learning_rate": 0.000279191183521868, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.6407101154327393, + "step": 18280 + }, + { + "ce_loss": 0.13469745218753815, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.302370548248291, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.15269644558429718, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.462386816740036, + "step": 18280 + }, + { + "ce_loss": 0.0804935097694397, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.26779067516326904, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.08050373941659927, + "step": 18280 + }, + { + "epoch": 6.100733822548365, + "loss": 0.6526, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "grad_norm": 1.4544947147369385, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "learning_rate": 0.00027877930317375086, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.5759723782539368, + "step": 18290 + }, + { + "ce_loss": 0.0904451534152031, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.31251609325408936, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.13719967007637024, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.5790067911148071, + "step": 18290 + }, + { + "ce_loss": 0.1277799904346466, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.30773675441741943, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.14318691194057465, + "step": 18290 + }, + { + "epoch": 6.104069379586391, + "loss": 0.6527, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "grad_norm": 1.80874502658844, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "learning_rate": 0.0002783675643032308, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.570338249206543, + "step": 18300 + }, + { + "ce_loss": 0.13978049159049988, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.26612356305122375, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.13457608222961426, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.5437963008880615, + "step": 18300 + }, + { + "ce_loss": 0.10671614110469818, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.2908667325973511, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.10786082595586777, + "step": 18300 + }, + { + "epoch": 6.107404936624416, + "loss": 0.7081, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "grad_norm": 1.6988565921783447, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "learning_rate": 0.0002779559673908514, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.5674381256103516, + "step": 18310 + }, + { + "ce_loss": 0.12075648456811905, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.2612060308456421, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.09225589036941528, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.8771853446960449, + "step": 18310 + }, + { + "ce_loss": 0.17971204221248627, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.2916775941848755, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.1181989461183548, + "step": 18310 + }, + { + "epoch": 6.110740493662441, + "loss": 0.6473, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "grad_norm": 1.7148319482803345, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "learning_rate": 0.00027754451291699063, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.5265355110168457, + "step": 18320 + }, + { + "ce_loss": 0.10530180484056473, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.2391783744096756, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.12191746383905411, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.5689677000045776, + "step": 18320 + }, + { + "ce_loss": 0.10074944794178009, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.3271622061729431, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.11129195988178253, + "step": 18320 + }, + { + "epoch": 6.114076050700467, + "loss": 0.6375, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "grad_norm": 1.5958406925201416, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "learning_rate": 0.0002771332013618599, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.5165610909461975, + "step": 18330 + }, + { + "ce_loss": 0.09935726225376129, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.2832036018371582, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.13384084403514862, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.5234933495521545, + "step": 18330 + }, + { + "ce_loss": 0.07920798659324646, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.2518523037433624, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.11808153241872787, + "step": 18330 + }, + { + "epoch": 6.117411607738492, + "loss": 0.6802, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "grad_norm": 1.1820212602615356, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "learning_rate": 0.00027672203320550434, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.6534271240234375, + "step": 18340 + }, + { + "ce_loss": 0.18147516250610352, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.27505460381507874, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.11951899528503418, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.7164322733879089, + "step": 18340 + }, + { + "ce_loss": 0.1533847153186798, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.22848907113075256, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.10289295017719269, + "step": 18340 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.6214, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "grad_norm": 1.0374150276184082, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "learning_rate": 0.00027631100892780116, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.5371941328048706, + "step": 18350 + }, + { + "ce_loss": 0.18888607621192932, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.23849518597126007, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.10944174975156784, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.40661856532096863, + "step": 18350 + }, + { + "ce_loss": 0.09315002709627151, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.20016992092132568, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.11312264204025269, + "step": 18350 + }, + { + "epoch": 6.124082721814543, + "loss": 0.5196, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "grad_norm": 1.4926507472991943, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "learning_rate": 0.00027590012900846, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.41106167435646057, + "step": 18360 + }, + { + "ce_loss": 0.08867556601762772, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.19158530235290527, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.09265054017305374, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.4671410620212555, + "step": 18360 + }, + { + "ce_loss": 0.13322880864143372, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.2158261090517044, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.11775711923837662, + "step": 18360 + }, + { + "epoch": 6.127418278852568, + "loss": 0.5083, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "grad_norm": 1.4729706048965454, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "learning_rate": 0.0002754893939270221, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 0.5703041553497314, + "step": 18370 + }, + { + "ce_loss": 0.11886131763458252, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.2752629220485687, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.10208799690008163, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 1.0630515813827515, + "step": 18370 + }, + { + "ce_loss": 0.12333641946315765, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.27853256464004517, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.08770790696144104, + "step": 18370 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.7476, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "grad_norm": 1.7509486675262451, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "learning_rate": 0.0002750788041628593, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.5735849142074585, + "step": 18380 + }, + { + "ce_loss": 0.16998191177845, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.3000338077545166, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.07679232954978943, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.6713649034500122, + "step": 18380 + }, + { + "ce_loss": 0.12456465512514114, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.32041725516319275, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.1309889256954193, + "step": 18380 + }, + { + "epoch": 6.134089392928619, + "loss": 0.717, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "grad_norm": 1.6819425821304321, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "learning_rate": 0.0002746683601951743, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.5657597780227661, + "step": 18390 + }, + { + "ce_loss": 0.10514745861291885, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.26283934712409973, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.12309848517179489, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.8041913509368896, + "step": 18390 + }, + { + "ce_loss": 0.17058131098747253, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.33370333909988403, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.13370048999786377, + "step": 18390 + }, + { + "epoch": 6.137424949966644, + "loss": 0.6625, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "grad_norm": 1.8284673690795898, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "learning_rate": 0.00027425806250299897, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.6972324848175049, + "step": 18400 + }, + { + "ce_loss": 0.17571482062339783, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.3037504255771637, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.1777459979057312, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.6110414862632751, + "step": 18400 + }, + { + "ce_loss": 0.14571216702461243, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.33484402298927307, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.13004137575626373, + "step": 18400 + }, + { + "epoch": 6.14076050700467, + "loss": 0.5864, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "grad_norm": 1.7372549772262573, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "learning_rate": 0.0002738479115651953, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.43518373370170593, + "step": 18410 + }, + { + "ce_loss": 0.11351682245731354, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.19356505572795868, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.12793107330799103, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.5647701025009155, + "step": 18410 + }, + { + "ce_loss": 0.13868297636508942, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.25933837890625, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.133583664894104, + "step": 18410 + }, + { + "epoch": 6.144096064042695, + "loss": 0.6318, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "grad_norm": 1.194664478302002, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "learning_rate": 0.0002734379078604532, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.4552597105503082, + "step": 18420 + }, + { + "ce_loss": 0.09872724860906601, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.23186928033828735, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.0990760400891304, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.7257994413375854, + "step": 18420 + }, + { + "ce_loss": 0.2151937484741211, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.26309436559677124, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.14622631669044495, + "step": 18420 + }, + { + "epoch": 6.14743162108072, + "loss": 0.6626, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "grad_norm": 2.8523292541503906, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "learning_rate": 0.00027302805186729136, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.6845064759254456, + "step": 18430 + }, + { + "ce_loss": 0.13927432894706726, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.3564937114715576, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.14247988164424896, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.6814566254615784, + "step": 18430 + }, + { + "ce_loss": 0.19597093760967255, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.3532121181488037, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.13141068816184998, + "step": 18430 + }, + { + "epoch": 6.150767178118746, + "loss": 0.6815, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "grad_norm": 2.0527751445770264, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "learning_rate": 0.0002726183440640557, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.6522032022476196, + "step": 18440 + }, + { + "ce_loss": 0.1364910751581192, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.2703036665916443, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.13433896005153656, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.6549949645996094, + "step": 18440 + }, + { + "ce_loss": 0.15011869370937347, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.3180496394634247, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.12303827702999115, + "step": 18440 + }, + { + "epoch": 6.154102735156771, + "loss": 0.6876, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "grad_norm": 1.4469850063323975, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "learning_rate": 0.0002722087849289194, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.8605791330337524, + "step": 18450 + }, + { + "ce_loss": 0.18606171011924744, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.3016606867313385, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.1333146095275879, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.6281872391700745, + "step": 18450 + }, + { + "ce_loss": 0.130160853266716, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.3120817244052887, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.11217764019966125, + "step": 18450 + }, + { + "epoch": 6.157438292194796, + "loss": 0.594, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "grad_norm": 1.4657039642333984, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "learning_rate": 0.0002717993749398819, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.6093101501464844, + "step": 18460 + }, + { + "ce_loss": 0.12690822780132294, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.2589327096939087, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.1347208470106125, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.6367437243461609, + "step": 18460 + }, + { + "ce_loss": 0.15475201606750488, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.29213953018188477, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.133981853723526, + "step": 18460 + }, + { + "epoch": 6.160773849232822, + "loss": 0.6148, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "grad_norm": 2.235093116760254, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "learning_rate": 0.0002713901145747687, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 1.0750973224639893, + "step": 18470 + }, + { + "ce_loss": 0.1199803277850151, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.21382658183574677, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.10405635833740234, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 0.7381834983825684, + "step": 18470 + }, + { + "ce_loss": 0.11732760816812515, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.19893065094947815, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.14228032529354095, + "step": 18470 + }, + { + "epoch": 6.164109406270847, + "loss": 0.6314, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "grad_norm": 1.4539425373077393, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "learning_rate": 0.00027098100431123095, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 0.617901623249054, + "step": 18480 + }, + { + "ce_loss": 0.16611261665821075, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.2678413391113281, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.11507727205753326, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 1.0038886070251465, + "step": 18480 + }, + { + "ce_loss": 0.17187930643558502, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.27773842215538025, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.16759958863258362, + "step": 18480 + }, + { + "epoch": 6.167444963308872, + "loss": 0.6242, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "grad_norm": 1.6894118785858154, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "learning_rate": 0.0002705720446267442, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.5968695878982544, + "step": 18490 + }, + { + "ce_loss": 0.13667964935302734, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.23305979371070862, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.12112811952829361, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.5195899605751038, + "step": 18490 + }, + { + "ce_loss": 0.11936408281326294, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.269527405500412, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.097865030169487, + "step": 18490 + }, + { + "epoch": 6.170780520346898, + "loss": 0.6507, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "grad_norm": 1.6241803169250488, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "learning_rate": 0.0002701632359986083, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.7157676815986633, + "step": 18500 + }, + { + "ce_loss": 0.1360265463590622, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.2981744110584259, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.1476881355047226, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.630430281162262, + "step": 18500 + }, + { + "ce_loss": 0.1654825359582901, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.23207513988018036, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.123602956533432, + "step": 18500 + }, + { + "epoch": 6.174116077384923, + "loss": 0.6731, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "grad_norm": 2.015723466873169, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "learning_rate": 0.0002697545789039472, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.5642631649971008, + "step": 18510 + }, + { + "ce_loss": 0.1319328248500824, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.295981228351593, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.13301639258861542, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.7798033952713013, + "step": 18510 + }, + { + "ce_loss": 0.11101414263248444, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.22768959403038025, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.13759620487689972, + "step": 18510 + }, + { + "epoch": 6.177451634422948, + "loss": 0.6435, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "grad_norm": 3.946340560913086, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "learning_rate": 0.00026934607381970735, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.5880672931671143, + "step": 18520 + }, + { + "ce_loss": 0.1387503296136856, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.2876739501953125, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.12377411127090454, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.6849022507667542, + "step": 18520 + }, + { + "ce_loss": 0.18922367691993713, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.3700937330722809, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.1246335506439209, + "step": 18520 + }, + { + "epoch": 6.180787191460974, + "loss": 0.5849, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "grad_norm": 1.7427726984024048, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "learning_rate": 0.0002689377212226583, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.5720622539520264, + "step": 18530 + }, + { + "ce_loss": 0.09880984574556351, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.18975934386253357, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.12640048563480377, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.5118294954299927, + "step": 18530 + }, + { + "ce_loss": 0.11647960543632507, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.23939764499664307, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.09006281197071075, + "step": 18530 + }, + { + "epoch": 6.184122748498999, + "loss": 0.5788, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "grad_norm": 2.905853509902954, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "learning_rate": 0.0002685295215893915, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.7311304807662964, + "step": 18540 + }, + { + "ce_loss": 0.10905653983354568, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.24801619350910187, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.11523216217756271, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.8149339556694031, + "step": 18540 + }, + { + "ce_loss": 0.19311216473579407, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.29600635170936584, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.13978679478168488, + "step": 18540 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.6132, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "grad_norm": 1.2603939771652222, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "learning_rate": 0.0002681214753963198, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.41944119334220886, + "step": 18550 + }, + { + "ce_loss": 0.07490330934524536, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.21885143220424652, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.09496767073869705, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.7222049832344055, + "step": 18550 + }, + { + "ce_loss": 0.12777599692344666, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.27243223786354065, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.07724441587924957, + "step": 18550 + }, + { + "epoch": 6.19079386257505, + "loss": 0.6462, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "grad_norm": 2.3181214332580566, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "learning_rate": 0.0002677135831196771, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.43296733498573303, + "step": 18560 + }, + { + "ce_loss": 0.08375538140535355, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.23206079006195068, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.09651433676481247, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.809476912021637, + "step": 18560 + }, + { + "ce_loss": 0.15913410484790802, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.3202086091041565, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.12562784552574158, + "step": 18560 + }, + { + "epoch": 6.194129419613075, + "loss": 0.6267, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "grad_norm": 1.503143548965454, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "learning_rate": 0.00026730584523551744, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.6719030141830444, + "step": 18570 + }, + { + "ce_loss": 0.11842752993106842, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.2962538003921509, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.12865594029426575, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.6024667620658875, + "step": 18570 + }, + { + "ce_loss": 0.15397170186042786, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.2786436975002289, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.1400149017572403, + "step": 18570 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.6434, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "grad_norm": 1.6247886419296265, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "learning_rate": 0.0002668982622197148, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.6952130794525146, + "step": 18580 + }, + { + "ce_loss": 0.08196988701820374, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.2731947898864746, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.08738795667886734, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.5540481209754944, + "step": 18580 + }, + { + "ce_loss": 0.11001595109701157, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.24366402626037598, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.12300706654787064, + "step": 18580 + }, + { + "epoch": 6.200800533689126, + "loss": 0.6145, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "grad_norm": 1.7628047466278076, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "learning_rate": 0.0002664908345479625, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.5917359590530396, + "step": 18590 + }, + { + "ce_loss": 0.12417437136173248, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.24398832023143768, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.11655285954475403, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.5952635407447815, + "step": 18590 + }, + { + "ce_loss": 0.1612391471862793, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.2423071712255478, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.1544157713651657, + "step": 18590 + }, + { + "epoch": 6.204136090727151, + "loss": 0.6139, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "grad_norm": 1.9439404010772705, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "learning_rate": 0.0002660835626957726, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.4388802647590637, + "step": 18600 + }, + { + "ce_loss": 0.08696887642145157, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.17513489723205566, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.11585807055234909, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.3982605040073395, + "step": 18600 + }, + { + "ce_loss": 0.08773314952850342, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.19973807036876678, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.11063937097787857, + "step": 18600 + }, + { + "epoch": 6.207471647765177, + "loss": 0.5959, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "grad_norm": 2.576935291290283, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "learning_rate": 0.0002656764471384749, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.4419233202934265, + "step": 18610 + }, + { + "ce_loss": 0.08523976802825928, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.16447491943836212, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.0948585495352745, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.5787880420684814, + "step": 18610 + }, + { + "ce_loss": 0.12167631834745407, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.20232316851615906, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.1111878976225853, + "step": 18610 + }, + { + "epoch": 6.210807204803202, + "loss": 0.6047, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "grad_norm": 1.482259750366211, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "learning_rate": 0.0002652694883512173, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.6412470936775208, + "step": 18620 + }, + { + "ce_loss": 0.1887206733226776, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.2719080448150635, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.1438676118850708, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.6414347290992737, + "step": 18620 + }, + { + "ce_loss": 0.1531907618045807, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.25418105721473694, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.12958569824695587, + "step": 18620 + }, + { + "epoch": 6.214142761841227, + "loss": 0.5762, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "grad_norm": 1.4014415740966797, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "learning_rate": 0.0002648626868089644, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.6205796003341675, + "step": 18630 + }, + { + "ce_loss": 0.12997017800807953, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.25296303629875183, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.12407161295413971, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.5109315514564514, + "step": 18630 + }, + { + "ce_loss": 0.11794036626815796, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.2793790400028229, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.08065132796764374, + "step": 18630 + }, + { + "epoch": 6.217478318879253, + "loss": 0.6147, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "grad_norm": 1.4799383878707886, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "learning_rate": 0.00026445604298649727, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.517666220664978, + "step": 18640 + }, + { + "ce_loss": 0.12360420823097229, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.24259185791015625, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.11556177586317062, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.6571850776672363, + "step": 18640 + }, + { + "ce_loss": 0.09585979580879211, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.2677266597747803, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.10569890588521957, + "step": 18640 + }, + { + "epoch": 6.220813875917278, + "loss": 0.5984, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "grad_norm": 1.4804229736328125, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "learning_rate": 0.00026404955735841325, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.5628892183303833, + "step": 18650 + }, + { + "ce_loss": 0.11343202739953995, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.29269713163375854, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.1565801501274109, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.7792778015136719, + "step": 18650 + }, + { + "ce_loss": 0.14788763225078583, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.3236067295074463, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.14538931846618652, + "step": 18650 + }, + { + "epoch": 6.224149432955303, + "loss": 0.6762, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "grad_norm": 1.1050039529800415, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "learning_rate": 0.0002636432303991245, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.5967389345169067, + "step": 18660 + }, + { + "ce_loss": 0.11555395275354385, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.27795520424842834, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.11349157243967056, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.5443832874298096, + "step": 18660 + }, + { + "ce_loss": 0.15342643857002258, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.2519240081310272, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.09871362149715424, + "step": 18660 + }, + { + "epoch": 6.227484989993329, + "loss": 0.6513, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "grad_norm": 6.464293956756592, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "learning_rate": 0.00026323706258285864, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.5531576871871948, + "step": 18670 + }, + { + "ce_loss": 0.07649177312850952, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.3307036757469177, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.10162324458360672, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.5954231023788452, + "step": 18670 + }, + { + "ce_loss": 0.1320202797651291, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.23801815509796143, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.11234842240810394, + "step": 18670 + }, + { + "epoch": 6.230820547031354, + "loss": 0.5936, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "grad_norm": 2.4922399520874023, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "learning_rate": 0.00026283105438365697, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.6484638452529907, + "step": 18680 + }, + { + "ce_loss": 0.1432270109653473, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.338364839553833, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.13439004123210907, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.6381657123565674, + "step": 18680 + }, + { + "ce_loss": 0.12378459423780441, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.3475711941719055, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.12904280424118042, + "step": 18680 + }, + { + "epoch": 6.234156104069379, + "loss": 0.6054, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "grad_norm": 1.3250396251678467, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "learning_rate": 0.00026242520627537465, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.43528658151626587, + "step": 18690 + }, + { + "ce_loss": 0.08029738813638687, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.2519910931587219, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.1028536707162857, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.5063867568969727, + "step": 18690 + }, + { + "ce_loss": 0.08997374773025513, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.2542618215084076, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.11495349556207657, + "step": 18690 + }, + { + "epoch": 6.237491661107405, + "loss": 0.5642, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "grad_norm": 2.5319509506225586, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "learning_rate": 0.0002620195187316805, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.5081156492233276, + "step": 18700 + }, + { + "ce_loss": 0.10081464052200317, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.23042288422584534, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.08740917593240738, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.6952100396156311, + "step": 18700 + }, + { + "ce_loss": 0.1750820428133011, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.2979185879230499, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.12627197802066803, + "step": 18700 + }, + { + "epoch": 6.24082721814543, + "loss": 0.6507, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "grad_norm": 1.9645556211471558, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "learning_rate": 0.00026161399222605523, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.7466223239898682, + "step": 18710 + }, + { + "ce_loss": 0.21908186376094818, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.2823524475097656, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.1386713981628418, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.5161977410316467, + "step": 18710 + }, + { + "ce_loss": 0.08397981524467468, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.2072904109954834, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.11395914852619171, + "step": 18710 + }, + { + "epoch": 6.244162775183455, + "loss": 0.5942, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "grad_norm": 1.0424362421035767, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "learning_rate": 0.00026120862723179203, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.6296104192733765, + "step": 18720 + }, + { + "ce_loss": 0.16182675957679749, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.281975656747818, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.15327884256839752, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.5066336989402771, + "step": 18720 + }, + { + "ce_loss": 0.13477244973182678, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.2248753309249878, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.11454130709171295, + "step": 18720 + }, + { + "epoch": 6.247498332221481, + "loss": 0.5769, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "grad_norm": 1.3180054426193237, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "learning_rate": 0.00026080342422199536, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.5453575849533081, + "step": 18730 + }, + { + "ce_loss": 0.12045739591121674, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.2378164380788803, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.11338043212890625, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.8507588505744934, + "step": 18730 + }, + { + "ce_loss": 0.16890116035938263, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.3025205135345459, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.14612118899822235, + "step": 18730 + }, + { + "epoch": 6.250833889259506, + "loss": 0.6153, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "grad_norm": 1.7155362367630005, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "learning_rate": 0.00026039838366958087, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.7768467664718628, + "step": 18740 + }, + { + "ce_loss": 0.10443000495433807, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.3253715932369232, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.09035629034042358, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.5813509225845337, + "step": 18740 + }, + { + "ce_loss": 0.1278662383556366, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.28240811824798584, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.12653017044067383, + "step": 18740 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.6759, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "grad_norm": 1.5573009252548218, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "learning_rate": 0.0002599935060472743, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.6010094881057739, + "step": 18750 + }, + { + "ce_loss": 0.1629936397075653, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.27384552359580994, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.13688445091247559, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.8907907605171204, + "step": 18750 + }, + { + "ce_loss": 0.1886061728000641, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.3372310996055603, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.16175268590450287, + "step": 18750 + }, + { + "epoch": 6.257505003335557, + "loss": 0.6196, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "grad_norm": 1.3572313785552979, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "learning_rate": 0.0002595887918276116, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.5978425741195679, + "step": 18760 + }, + { + "ce_loss": 0.17033596336841583, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.2939685583114624, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.10807149857282639, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.5586941838264465, + "step": 18760 + }, + { + "ce_loss": 0.09725350141525269, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.2762550711631775, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.13775742053985596, + "step": 18760 + }, + { + "epoch": 6.260840560373582, + "loss": 0.5952, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "grad_norm": 1.4143478870391846, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "learning_rate": 0.0002591842414829376, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.36823388934135437, + "step": 18770 + }, + { + "ce_loss": 0.06665971130132675, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.20049230754375458, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.1006406843662262, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.4194163382053375, + "step": 18770 + }, + { + "ce_loss": 0.09155339002609253, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.22316360473632812, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.10426671802997589, + "step": 18770 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.5954, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "grad_norm": 2.267418622970581, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "learning_rate": 0.0002587798554854063, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.579855740070343, + "step": 18780 + }, + { + "ce_loss": 0.14227893948554993, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.2690858244895935, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.1261933445930481, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.5423168540000916, + "step": 18780 + }, + { + "ce_loss": 0.10789535194635391, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.243142232298851, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.13346213102340698, + "step": 18780 + }, + { + "epoch": 6.267511674449633, + "loss": 0.6002, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "grad_norm": 1.1807934045791626, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "learning_rate": 0.00025837563430697953, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.3969920575618744, + "step": 18790 + }, + { + "ce_loss": 0.0700957402586937, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.196085587143898, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.09772870689630508, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.8665177822113037, + "step": 18790 + }, + { + "ce_loss": 0.10357936471700668, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.2201247364282608, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.09220992028713226, + "step": 18790 + }, + { + "epoch": 6.270847231487658, + "loss": 0.6304, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "grad_norm": 1.4579709768295288, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "learning_rate": 0.00025797157841942674, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 0.5386521816253662, + "step": 18800 + }, + { + "ce_loss": 0.13596661388874054, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.257935106754303, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.11433033645153046, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 1.0312557220458984, + "step": 18800 + }, + { + "ce_loss": 0.10936436057090759, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.25603824853897095, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.10420667380094528, + "step": 18800 + }, + { + "epoch": 6.274182788525684, + "loss": 0.6355, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "grad_norm": 1.4766558408737183, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "learning_rate": 0.00025756768829432496, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.7762137055397034, + "step": 18810 + }, + { + "ce_loss": 0.14689825475215912, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.23592166602611542, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.13268359005451202, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.5970646739006042, + "step": 18810 + }, + { + "ce_loss": 0.16350924968719482, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.2649400532245636, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.12570339441299438, + "step": 18810 + }, + { + "epoch": 6.277518345563709, + "loss": 0.6227, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "grad_norm": 1.5304725170135498, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "learning_rate": 0.0002571639644030574, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.5977932214736938, + "step": 18820 + }, + { + "ce_loss": 0.13757960498332977, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.2880479097366333, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.13617762923240662, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.6033172011375427, + "step": 18820 + }, + { + "ce_loss": 0.14228224754333496, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.28515636920928955, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.12235098332166672, + "step": 18820 + }, + { + "epoch": 6.280853902601734, + "loss": 0.6056, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "grad_norm": 4.085591793060303, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "learning_rate": 0.00025676040721681303, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 1.065446376800537, + "step": 18830 + }, + { + "ce_loss": 0.13691724836826324, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.28984662890434265, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.1384049654006958, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 0.8404492139816284, + "step": 18830 + }, + { + "ce_loss": 0.18925227224826813, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.3313180208206177, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.14136530458927155, + "step": 18830 + }, + { + "epoch": 6.28418945963976, + "loss": 0.6549, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "grad_norm": 1.796584129333496, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "learning_rate": 0.00025635701720658677, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.5860927700996399, + "step": 18840 + }, + { + "ce_loss": 0.1099044606089592, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.24969011545181274, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.1132139191031456, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.6870462894439697, + "step": 18840 + }, + { + "ce_loss": 0.15823398530483246, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.3364105820655823, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.1049603670835495, + "step": 18840 + }, + { + "epoch": 6.287525016677785, + "loss": 0.6795, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "grad_norm": 2.2475714683532715, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "learning_rate": 0.000255953794843178, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.4239915609359741, + "step": 18850 + }, + { + "ce_loss": 0.07156173139810562, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.2099079191684723, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.11280690878629684, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.4179385006427765, + "step": 18850 + }, + { + "ce_loss": 0.0768141821026802, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.20585425198078156, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.10265933722257614, + "step": 18850 + }, + { + "epoch": 6.29086057371581, + "loss": 0.7121, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "grad_norm": 3.4591379165649414, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "learning_rate": 0.00025555074059719073, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.3988853096961975, + "step": 18860 + }, + { + "ce_loss": 0.07139135897159576, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.21063552796840668, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.084518663585186, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.6735532879829407, + "step": 18860 + }, + { + "ce_loss": 0.16425824165344238, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.33580243587493896, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.12752461433410645, + "step": 18860 + }, + { + "epoch": 6.294196130753836, + "loss": 0.6134, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "grad_norm": 1.5801092386245728, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "learning_rate": 0.0002551478549390325, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 0.38322246074676514, + "step": 18870 + }, + { + "ce_loss": 0.07935591042041779, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.22366979718208313, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.06230417639017105, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 1.041303277015686, + "step": 18870 + }, + { + "ce_loss": 0.13986216485500336, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.27043458819389343, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.10601771622896194, + "step": 18870 + }, + { + "epoch": 6.297531687791861, + "loss": 0.6444, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "grad_norm": 4.337099552154541, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "learning_rate": 0.00025474513833891434, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.6033490300178528, + "step": 18880 + }, + { + "ce_loss": 0.16330893337726593, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.25854185223579407, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.1479196399450302, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.40364956855773926, + "step": 18880 + }, + { + "ce_loss": 0.08617854118347168, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.18217800557613373, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.11525102704763412, + "step": 18880 + }, + { + "epoch": 6.300867244829886, + "loss": 0.5843, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "grad_norm": 1.3575999736785889, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "learning_rate": 0.00025434259126684973, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.7759307622909546, + "step": 18890 + }, + { + "ce_loss": 0.171718567609787, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.26840507984161377, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.12806598842144012, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.7826226949691772, + "step": 18890 + }, + { + "ce_loss": 0.16975845396518707, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.3657705783843994, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.15103502571582794, + "step": 18890 + }, + { + "epoch": 6.304202801867912, + "loss": 0.634, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "grad_norm": 1.8236695528030396, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "learning_rate": 0.0002539402141926546, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.6085480451583862, + "step": 18900 + }, + { + "ce_loss": 0.10115724802017212, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.3330099880695343, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.1174900159239769, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.46579235792160034, + "step": 18900 + }, + { + "ce_loss": 0.10644608736038208, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.20871976017951965, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.10825169831514359, + "step": 18900 + }, + { + "epoch": 6.307538358905937, + "loss": 0.5919, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "grad_norm": 1.4280165433883667, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "learning_rate": 0.000253538007585946, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.5173388123512268, + "step": 18910 + }, + { + "ce_loss": 0.12827540934085846, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.2661437392234802, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.12170099467039108, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.5308101177215576, + "step": 18910 + }, + { + "ce_loss": 0.11813155561685562, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.24241910874843597, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.12783372402191162, + "step": 18910 + }, + { + "epoch": 6.310873915943962, + "loss": 0.6298, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "grad_norm": 1.4830825328826904, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "learning_rate": 0.0002531359719161426, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.6821950674057007, + "step": 18920 + }, + { + "ce_loss": 0.16001774370670319, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.2875783145427704, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.13389131426811218, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.7266680002212524, + "step": 18920 + }, + { + "ce_loss": 0.20016908645629883, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.37826067209243774, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.14504815638065338, + "step": 18920 + }, + { + "epoch": 6.314209472981988, + "loss": 0.6279, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "grad_norm": 1.6696503162384033, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "learning_rate": 0.0002527341076524633, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.5761136412620544, + "step": 18930 + }, + { + "ce_loss": 0.11060384660959244, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.25381311774253845, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.1130882278084755, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.8104597330093384, + "step": 18930 + }, + { + "ce_loss": 0.16445434093475342, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.33795028924942017, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.14992178976535797, + "step": 18930 + }, + { + "epoch": 6.317545030020013, + "loss": 0.657, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "grad_norm": 1.4184163808822632, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "learning_rate": 0.00025233241526392673, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.5488178730010986, + "step": 18940 + }, + { + "ce_loss": 0.15158149600028992, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.29170292615890503, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.10527343302965164, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.5522985458374023, + "step": 18940 + }, + { + "ce_loss": 0.10687598586082458, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.28288522362709045, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.11455663293600082, + "step": 18940 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.6514, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "grad_norm": 2.5147652626037598, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "learning_rate": 0.0002519308952193513, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.6704400777816772, + "step": 18950 + }, + { + "ce_loss": 0.1592797189950943, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.24356558918952942, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.0978129580616951, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.5375022292137146, + "step": 18950 + }, + { + "ce_loss": 0.11981920897960663, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.24265658855438232, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.14514128863811493, + "step": 18950 + }, + { + "epoch": 6.324216144096064, + "loss": 0.5806, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "grad_norm": 2.096982717514038, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "learning_rate": 0.000251529547987354, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.6203213930130005, + "step": 18960 + }, + { + "ce_loss": 0.10694719105958939, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.21681025624275208, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.13992996513843536, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.6286050081253052, + "step": 18960 + }, + { + "ce_loss": 0.1949644386768341, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.21977798640727997, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.14627158641815186, + "step": 18960 + }, + { + "epoch": 6.327551701134089, + "loss": 0.6171, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "grad_norm": 1.7369863986968994, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "learning_rate": 0.0002511283740363504, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.6021354794502258, + "step": 18970 + }, + { + "ce_loss": 0.14903834462165833, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.27035075426101685, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.13529963791370392, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.6092727184295654, + "step": 18970 + }, + { + "ce_loss": 0.12355395406484604, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.305045485496521, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.13894686102867126, + "step": 18970 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.6264, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "grad_norm": 1.227952003479004, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "learning_rate": 0.0002507273738345534, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.5542557239532471, + "step": 18980 + }, + { + "ce_loss": 0.08878917247056961, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.2355813980102539, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.09402923285961151, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.6132735013961792, + "step": 18980 + }, + { + "ce_loss": 0.15250413119792938, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.26554346084594727, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.09666182845830917, + "step": 18980 + }, + { + "epoch": 6.33422281521014, + "loss": 0.6429, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "grad_norm": 1.4251363277435303, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "learning_rate": 0.0002503265478499736, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.4695286154747009, + "step": 18990 + }, + { + "ce_loss": 0.10755734145641327, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.19876904785633087, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.1363854706287384, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.4856092929840088, + "step": 18990 + }, + { + "ce_loss": 0.10343562811613083, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.24028657376766205, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.10447201877832413, + "step": 18990 + }, + { + "epoch": 6.337558372248165, + "loss": 0.6261, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "grad_norm": 10.998090744018555, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "learning_rate": 0.0002499258965504179, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.7924450635910034, + "step": 19000 + }, + { + "ce_loss": 0.16777165234088898, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.2919318675994873, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.16161774098873138, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.6410017013549805, + "step": 19000 + }, + { + "ce_loss": 0.15153133869171143, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.2678484320640564, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.10910286009311676, + "step": 19000 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.6259, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "grad_norm": 1.927211046218872, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "learning_rate": 0.0002495254204034897, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.5969608426094055, + "step": 19010 + }, + { + "ce_loss": 0.1397782862186432, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.2647230327129364, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.10247133672237396, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.7212368249893188, + "step": 19010 + }, + { + "ce_loss": 0.14784960448741913, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.2756213843822479, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.14659282565116882, + "step": 19010 + }, + { + "epoch": 6.344229486324216, + "loss": 0.6313, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "grad_norm": 2.6346116065979004, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "learning_rate": 0.00024912511987658744, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.6190280318260193, + "step": 19020 + }, + { + "ce_loss": 0.14975640177726746, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.2882692217826843, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.13756629824638367, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.7044951915740967, + "step": 19020 + }, + { + "ce_loss": 0.1121923103928566, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.2728697955608368, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.1473977267742157, + "step": 19020 + }, + { + "epoch": 6.347565043362241, + "loss": 0.6377, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "grad_norm": 1.7768129110336304, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "learning_rate": 0.00024872499543690524, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.7369954586029053, + "step": 19030 + }, + { + "ce_loss": 0.13455504179000854, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.23038426041603088, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.13557906448841095, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.5275669097900391, + "step": 19030 + }, + { + "ce_loss": 0.06495196372270584, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.23524709045886993, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.110582135617733, + "step": 19030 + }, + { + "epoch": 6.350900600400267, + "loss": 0.594, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "grad_norm": 1.8504769802093506, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "learning_rate": 0.00024832504755143114, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.6440969705581665, + "step": 19040 + }, + { + "ce_loss": 0.1938813179731369, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.3125225305557251, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.13713249564170837, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.5212450623512268, + "step": 19040 + }, + { + "ce_loss": 0.1180388480424881, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.28184014558792114, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.08356638252735138, + "step": 19040 + }, + { + "epoch": 6.354236157438292, + "loss": 0.6343, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "grad_norm": 1.3376898765563965, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "learning_rate": 0.0002479252766869476, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 0.46507495641708374, + "step": 19050 + }, + { + "ce_loss": 0.11169010400772095, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.24842992424964905, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.1047307476401329, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 1.4413424730300903, + "step": 19050 + }, + { + "ce_loss": 0.1381653994321823, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.33976855874061584, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.10790744423866272, + "step": 19050 + }, + { + "epoch": 6.357571714476317, + "loss": 0.6824, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "grad_norm": 1.5171325206756592, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "learning_rate": 0.00024752568331003, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.6507919430732727, + "step": 19060 + }, + { + "ce_loss": 0.10762443393468857, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.2979751527309418, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.1204538568854332, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.6927547454833984, + "step": 19060 + }, + { + "ce_loss": 0.17206089198589325, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.4154990613460541, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.10477491468191147, + "step": 19060 + }, + { + "epoch": 6.360907271514343, + "loss": 0.6538, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "grad_norm": 1.3551887273788452, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "learning_rate": 0.0002471262678870469, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.6207748055458069, + "step": 19070 + }, + { + "ce_loss": 0.13332809507846832, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.32056692242622375, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.13821867108345032, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.44919517636299133, + "step": 19070 + }, + { + "ce_loss": 0.05316970869898796, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.24726107716560364, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.10930788516998291, + "step": 19070 + }, + { + "epoch": 6.364242828552368, + "loss": 0.6372, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "grad_norm": 6.933979511260986, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "learning_rate": 0.000246727030884159, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.485307514667511, + "step": 19080 + }, + { + "ce_loss": 0.10739357769489288, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.2198185920715332, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.11307939141988754, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.7215542197227478, + "step": 19080 + }, + { + "ce_loss": 0.2537827491760254, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.30832213163375854, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.15916509926319122, + "step": 19080 + }, + { + "epoch": 6.367578385590393, + "loss": 0.651, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "grad_norm": 1.340692162513733, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "learning_rate": 0.000246327972767319, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.510690450668335, + "step": 19090 + }, + { + "ce_loss": 0.1185477003455162, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.2782058119773865, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.11369862407445908, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.4796235263347626, + "step": 19090 + }, + { + "ce_loss": 0.08776428550481796, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.2346237152814865, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.0737283006310463, + "step": 19090 + }, + { + "epoch": 6.370913942628419, + "loss": 0.6061, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "grad_norm": 1.6497814655303955, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "learning_rate": 0.0002459290940022705, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.5280488133430481, + "step": 19100 + }, + { + "ce_loss": 0.10927163064479828, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.19644902646541595, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.12360698729753494, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.5164461731910706, + "step": 19100 + }, + { + "ce_loss": 0.11721847951412201, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.2527116537094116, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.14623260498046875, + "step": 19100 + }, + { + "epoch": 6.374249499666444, + "loss": 0.6256, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "grad_norm": 3.0138490200042725, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "learning_rate": 0.0002455303950545482, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.6398062109947205, + "step": 19110 + }, + { + "ce_loss": 0.14219415187835693, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.3258860409259796, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.13845248520374298, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.47664421796798706, + "step": 19110 + }, + { + "ce_loss": 0.08407106250524521, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.2631029784679413, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.09293530881404877, + "step": 19110 + }, + { + "epoch": 6.377585056704469, + "loss": 0.5997, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "grad_norm": 1.3262351751327515, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "learning_rate": 0.00024513187638947634, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.600747287273407, + "step": 19120 + }, + { + "ce_loss": 0.169780895113945, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.2887939214706421, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.11096259951591492, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.5601503849029541, + "step": 19120 + }, + { + "ce_loss": 0.11062528938055038, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.23963096737861633, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.11627501249313354, + "step": 19120 + }, + { + "epoch": 6.380920613742495, + "loss": 0.6067, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "grad_norm": 2.5720584392547607, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "learning_rate": 0.00024473353847216927, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.9681487083435059, + "step": 19130 + }, + { + "ce_loss": 0.13055378198623657, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.22957968711853027, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.1326015293598175, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.7964392304420471, + "step": 19130 + }, + { + "ce_loss": 0.23628319799900055, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.41514474153518677, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.11686284840106964, + "step": 19130 + }, + { + "epoch": 6.38425617078052, + "loss": 0.6259, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "grad_norm": 1.1457382440567017, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "learning_rate": 0.00024433538176753, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.47281309962272644, + "step": 19140 + }, + { + "ce_loss": 0.09508227556943893, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.21699288487434387, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.09583717584609985, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.5777966976165771, + "step": 19140 + }, + { + "ce_loss": 0.1204901710152626, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.2619478404521942, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.10629819333553314, + "step": 19140 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.5729, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "grad_norm": 1.4310050010681152, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "learning_rate": 0.00024393740674025054, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.566169023513794, + "step": 19150 + }, + { + "ce_loss": 0.1300867199897766, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.2211119830608368, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.10674677044153214, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.4511871337890625, + "step": 19150 + }, + { + "ce_loss": 0.12526176869869232, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.2322850525379181, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.09337151795625687, + "step": 19150 + }, + { + "epoch": 6.390927284856571, + "loss": 0.6005, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "grad_norm": 1.8587944507598877, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "learning_rate": 0.0002435396138548104, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.7587735056877136, + "step": 19160 + }, + { + "ce_loss": 0.1708332747220993, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.2876565456390381, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.14317108690738678, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.49477240443229675, + "step": 19160 + }, + { + "ce_loss": 0.15645787119865417, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.19042925536632538, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.1474367380142212, + "step": 19160 + }, + { + "epoch": 6.394262841894596, + "loss": 0.6025, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "grad_norm": 1.6388176679611206, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "learning_rate": 0.00024314200357547684, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.5780409574508667, + "step": 19170 + }, + { + "ce_loss": 0.1522267609834671, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.2433975338935852, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.12864993512630463, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.922885000705719, + "step": 19170 + }, + { + "ce_loss": 0.12585236132144928, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.24935932457447052, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.14388763904571533, + "step": 19170 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.6638, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "grad_norm": 1.546163558959961, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "learning_rate": 0.00024274457636630365, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.5836116671562195, + "step": 19180 + }, + { + "ce_loss": 0.12790940701961517, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.2516058087348938, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.10985349118709564, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.3911502957344055, + "step": 19180 + }, + { + "ce_loss": 0.0864064171910286, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.16342835128307343, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.10683442652225494, + "step": 19180 + }, + { + "epoch": 6.400933955970647, + "loss": 0.5883, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "grad_norm": 3.076702833175659, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "learning_rate": 0.00024234733269113128, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.5868853330612183, + "step": 19190 + }, + { + "ce_loss": 0.1519070714712143, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.2489883005619049, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.09567808359861374, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.4550357460975647, + "step": 19190 + }, + { + "ce_loss": 0.1052570566534996, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.25030675530433655, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.09924329817295074, + "step": 19190 + }, + { + "epoch": 6.404269513008672, + "loss": 0.6101, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "grad_norm": 3.8931031227111816, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "learning_rate": 0.00024195027301358572, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.5110546350479126, + "step": 19200 + }, + { + "ce_loss": 0.09627437591552734, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.24332520365715027, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.13109424710273743, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.6022369265556335, + "step": 19200 + }, + { + "ce_loss": 0.11745914071798325, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.22792620956897736, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.11005257070064545, + "step": 19200 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.6063, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "grad_norm": 1.5678305625915527, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "learning_rate": 0.00024155339779707852, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.564750611782074, + "step": 19210 + }, + { + "ce_loss": 0.07955728471279144, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.23164673149585724, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.1055203229188919, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.6098896861076355, + "step": 19210 + }, + { + "ce_loss": 0.14870597422122955, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.2938266098499298, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.13252505660057068, + "step": 19210 + }, + { + "epoch": 6.410940627084723, + "loss": 0.6006, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "grad_norm": 1.7106890678405762, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "learning_rate": 0.00024115670750480552, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.834604799747467, + "step": 19220 + }, + { + "ce_loss": 0.23402222990989685, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.28607699275016785, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.18355098366737366, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.6679419279098511, + "step": 19220 + }, + { + "ce_loss": 0.176821768283844, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.2825266420841217, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.11217405647039413, + "step": 19220 + }, + { + "epoch": 6.414276184122748, + "loss": 0.6123, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "grad_norm": 4.054162502288818, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "learning_rate": 0.00024076020259974722, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.9778499603271484, + "step": 19230 + }, + { + "ce_loss": 0.2200707495212555, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.3484407067298889, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.1907111555337906, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.4964205026626587, + "step": 19230 + }, + { + "ce_loss": 0.08122781664133072, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.20599791407585144, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.16448451578617096, + "step": 19230 + }, + { + "epoch": 6.417611741160774, + "loss": 0.5907, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "grad_norm": 1.3038452863693237, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "learning_rate": 0.00024036388354466728, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.49369338154792786, + "step": 19240 + }, + { + "ce_loss": 0.093342624604702, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.2022968977689743, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.10506366193294525, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.6517524123191833, + "step": 19240 + }, + { + "ce_loss": 0.14238576591014862, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.2635531425476074, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.11066452413797379, + "step": 19240 + }, + { + "epoch": 6.420947298198799, + "loss": 0.5716, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "grad_norm": 1.5243655443191528, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "learning_rate": 0.00023996775080211276, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.6852512359619141, + "step": 19250 + }, + { + "ce_loss": 0.1377100795507431, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.2286660373210907, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.09662225842475891, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.41076192259788513, + "step": 19250 + }, + { + "ce_loss": 0.09102330356836319, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.20911958813667297, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.11036841571331024, + "step": 19250 + }, + { + "epoch": 6.424282855236824, + "loss": 0.6531, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "grad_norm": 1.2440197467803955, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "learning_rate": 0.00023957180483441336, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.46109625697135925, + "step": 19260 + }, + { + "ce_loss": 0.10086578875780106, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.23006482422351837, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.10876964032649994, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.7225244641304016, + "step": 19260 + }, + { + "ce_loss": 0.18343444168567657, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.28087159991264343, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.15784253180027008, + "step": 19260 + }, + { + "epoch": 6.42761841227485, + "loss": 0.6306, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "grad_norm": 2.5110762119293213, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "learning_rate": 0.00023917604610368049, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.5681543946266174, + "step": 19270 + }, + { + "ce_loss": 0.15568585693836212, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.28800246119499207, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.12352965772151947, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.45578742027282715, + "step": 19270 + }, + { + "ce_loss": 0.08018852770328522, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.2554260492324829, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.09290342777967453, + "step": 19270 + }, + { + "epoch": 6.430953969312875, + "loss": 0.5742, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "grad_norm": 1.568680763244629, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "learning_rate": 0.00023878047507180718, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.6375356912612915, + "step": 19280 + }, + { + "ce_loss": 0.19469177722930908, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.29809805750846863, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.11061777174472809, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.6644027829170227, + "step": 19280 + }, + { + "ce_loss": 0.16229896247386932, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.2902613878250122, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.11752331256866455, + "step": 19280 + }, + { + "epoch": 6.4342895263509, + "loss": 0.6654, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "grad_norm": 4.854502201080322, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "learning_rate": 0.0002383850922004674, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.6462293863296509, + "step": 19290 + }, + { + "ce_loss": 0.15397413074970245, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.36222219467163086, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.09376648813486099, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.6333227753639221, + "step": 19290 + }, + { + "ce_loss": 0.12945379316806793, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.3720274567604065, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.09307526051998138, + "step": 19290 + }, + { + "epoch": 6.437625083388926, + "loss": 0.6233, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "grad_norm": 1.6913163661956787, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "learning_rate": 0.00023798989795111556, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.5537348985671997, + "step": 19300 + }, + { + "ce_loss": 0.12098418176174164, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.2599371075630188, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.10150939226150513, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.6065669655799866, + "step": 19300 + }, + { + "ce_loss": 0.13648119568824768, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.3049774765968323, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.14003805816173553, + "step": 19300 + }, + { + "epoch": 6.440960640426951, + "loss": 0.6065, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "grad_norm": 2.4706809520721436, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "learning_rate": 0.0002375948927849857, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.49890419840812683, + "step": 19310 + }, + { + "ce_loss": 0.09926062822341919, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.26944243907928467, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.1298120617866516, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.5722777843475342, + "step": 19310 + }, + { + "ce_loss": 0.12331241369247437, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.3165596127510071, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.09562243521213531, + "step": 19310 + }, + { + "epoch": 6.444296197464976, + "loss": 0.5972, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "grad_norm": 1.4854732751846313, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "learning_rate": 0.0002372000771630916, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.6584548354148865, + "step": 19320 + }, + { + "ce_loss": 0.12285207957029343, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.23300600051879883, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.14293143153190613, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.6606813669204712, + "step": 19320 + }, + { + "ce_loss": 0.163771390914917, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.263140469789505, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.12971723079681396, + "step": 19320 + }, + { + "epoch": 6.447631754503002, + "loss": 0.6658, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "grad_norm": 2.2731552124023438, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "learning_rate": 0.00023680545154622533, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.8783732056617737, + "step": 19330 + }, + { + "ce_loss": 0.1531582623720169, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.2370985746383667, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.10351847857236862, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.33935779333114624, + "step": 19330 + }, + { + "ce_loss": 0.07009600847959518, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.18475466966629028, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.08419663459062576, + "step": 19330 + }, + { + "epoch": 6.450967311541027, + "loss": 0.5768, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "grad_norm": 1.3007348775863647, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "learning_rate": 0.0002364110163949577, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 0.5033159255981445, + "step": 19340 + }, + { + "ce_loss": 0.10342025011777878, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.2683402895927429, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.1313643902540207, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 1.1450793743133545, + "step": 19340 + }, + { + "ce_loss": 0.19282738864421844, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.30856090784072876, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.14692862331867218, + "step": 19340 + }, + { + "epoch": 6.454302868579052, + "loss": 0.608, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "grad_norm": 1.5652614831924438, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "learning_rate": 0.00023601677216963674, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.8717043399810791, + "step": 19350 + }, + { + "ce_loss": 0.18375054001808167, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.2988278269767761, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.14031578600406647, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.47830939292907715, + "step": 19350 + }, + { + "ce_loss": 0.12546370923519135, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.22042587399482727, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.0987526997923851, + "step": 19350 + }, + { + "epoch": 6.457638425617078, + "loss": 0.682, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "grad_norm": 1.4231470823287964, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "learning_rate": 0.0002356227193303879, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.6524989604949951, + "step": 19360 + }, + { + "ce_loss": 0.11222806572914124, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.24415633082389832, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.1500520259141922, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.576420783996582, + "step": 19360 + }, + { + "ce_loss": 0.1236271932721138, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.2536475658416748, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.12630252540111542, + "step": 19360 + }, + { + "epoch": 6.460973982655103, + "loss": 0.5793, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "grad_norm": 1.6094447374343872, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "learning_rate": 0.00023522885833711339, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.5709271430969238, + "step": 19370 + }, + { + "ce_loss": 0.17371134459972382, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.29568082094192505, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.1014719232916832, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.3881910443305969, + "step": 19370 + }, + { + "ce_loss": 0.07997575402259827, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.16086122393608093, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.11414096504449844, + "step": 19370 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.5655, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "grad_norm": 1.544553279876709, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "learning_rate": 0.0002348351896494914, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.7244067192077637, + "step": 19380 + }, + { + "ce_loss": 0.13852331042289734, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.23501816391944885, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.13060513138771057, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.3620413541793823, + "step": 19380 + }, + { + "ce_loss": 0.08747374266386032, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.17260627448558807, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.07209240645170212, + "step": 19380 + }, + { + "epoch": 6.467645096731154, + "loss": 0.5893, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "grad_norm": 1.6576930284500122, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "learning_rate": 0.00023444171372697547, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 0.5739819407463074, + "step": 19390 + }, + { + "ce_loss": 0.133833110332489, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.27732205390930176, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.12894289195537567, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 1.0972142219543457, + "step": 19390 + }, + { + "ce_loss": 0.2051316499710083, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.2552916407585144, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.10956212878227234, + "step": 19390 + }, + { + "epoch": 6.470980653769179, + "loss": 0.6861, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "grad_norm": 1.7194758653640747, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "learning_rate": 0.00023404843102879452, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.6886380910873413, + "step": 19400 + }, + { + "ce_loss": 0.1668749749660492, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.26839739084243774, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.1428913176059723, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.5019069314002991, + "step": 19400 + }, + { + "ce_loss": 0.09855532646179199, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.23468101024627686, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.08430713415145874, + "step": 19400 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.6133, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "grad_norm": 1.5843451023101807, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "learning_rate": 0.0002336553420139516, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.6567657589912415, + "step": 19410 + }, + { + "ce_loss": 0.07436903566122055, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.20002348721027374, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.13849644362926483, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.6510065793991089, + "step": 19410 + }, + { + "ce_loss": 0.1610298454761505, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.3236733376979828, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.1232231929898262, + "step": 19410 + }, + { + "epoch": 6.47765176784523, + "loss": 0.6563, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "grad_norm": 1.6471127271652222, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "learning_rate": 0.0002332624471412241, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.5237685441970825, + "step": 19420 + }, + { + "ce_loss": 0.12244009971618652, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.22729995846748352, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.11839036643505096, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.8403283357620239, + "step": 19420 + }, + { + "ce_loss": 0.15734674036502838, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.24373812973499298, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.12084154784679413, + "step": 19420 + }, + { + "epoch": 6.480987324883255, + "loss": 0.6131, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "grad_norm": 1.3815033435821533, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "learning_rate": 0.00023286974686916235, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.6015914082527161, + "step": 19430 + }, + { + "ce_loss": 0.16842041909694672, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.2824459671974182, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.10923143476247787, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.4720430076122284, + "step": 19430 + }, + { + "ce_loss": 0.13118982315063477, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.21990752220153809, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.12076015025377274, + "step": 19430 + }, + { + "epoch": 6.484322881921281, + "loss": 0.5629, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "grad_norm": 1.0681235790252686, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "learning_rate": 0.00023247724165609, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.7614908814430237, + "step": 19440 + }, + { + "ce_loss": 0.1435561329126358, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.29191237688064575, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.12050356715917587, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.49718719720840454, + "step": 19440 + }, + { + "ce_loss": 0.09759809821844101, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.24840621650218964, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.10465782880783081, + "step": 19440 + }, + { + "epoch": 6.487658438959306, + "loss": 0.612, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "grad_norm": 1.1900967359542847, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "learning_rate": 0.00023208493196010292, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.7231645584106445, + "step": 19450 + }, + { + "ce_loss": 0.1517426073551178, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.2369062751531601, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.116610586643219, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.6371214389801025, + "step": 19450 + }, + { + "ce_loss": 0.1503915637731552, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.26254144310951233, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.11877196282148361, + "step": 19450 + }, + { + "epoch": 6.490993995997331, + "loss": 0.5708, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "grad_norm": 1.619773507118225, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "learning_rate": 0.00023169281823906857, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.3980375826358795, + "step": 19460 + }, + { + "ce_loss": 0.1058371439576149, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.19484549760818481, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.09722219407558441, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.5850816369056702, + "step": 19460 + }, + { + "ce_loss": 0.16564002633094788, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.25721365213394165, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.11461980640888214, + "step": 19460 + }, + { + "epoch": 6.494329553035357, + "loss": 0.5651, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "grad_norm": 1.083567500114441, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "learning_rate": 0.00023130090095062572, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.7174849510192871, + "step": 19470 + }, + { + "ce_loss": 0.11787024140357971, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.2558152675628662, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.13834647834300995, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.40972697734832764, + "step": 19470 + }, + { + "ce_loss": 0.07389426976442337, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.21317392587661743, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.08200264722108841, + "step": 19470 + }, + { + "epoch": 6.497665110073382, + "loss": 0.5995, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "grad_norm": 1.8454265594482422, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "learning_rate": 0.00023090918055218462, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.6001446843147278, + "step": 19480 + }, + { + "ce_loss": 0.16934876143932343, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.2737237215042114, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.1263040155172348, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.48116031289100647, + "step": 19480 + }, + { + "ce_loss": 0.11749963462352753, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.2328428030014038, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.13069671392440796, + "step": 19480 + }, + { + "epoch": 6.501000667111407, + "loss": 0.5274, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "grad_norm": 1.5844311714172363, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "learning_rate": 0.00023051765750092454, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.5427977442741394, + "step": 19490 + }, + { + "ce_loss": 0.1288299262523651, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.24468667805194855, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.13324861228466034, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.6738988161087036, + "step": 19490 + }, + { + "ce_loss": 0.17870159447193146, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.2239747941493988, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.15819533169269562, + "step": 19490 + }, + { + "epoch": 6.504336224149433, + "loss": 0.6324, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "grad_norm": 1.2954641580581665, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "learning_rate": 0.00023012633225379526, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.6101667284965515, + "step": 19500 + }, + { + "ce_loss": 0.1754041314125061, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.2514132559299469, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.11005375534296036, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.5737624764442444, + "step": 19500 + }, + { + "ce_loss": 0.13218827545642853, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.26241040229797363, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.15054892003536224, + "step": 19500 + }, + { + "epoch": 6.507671781187458, + "loss": 0.5694, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "grad_norm": 1.4394662380218506, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "learning_rate": 0.00022973520526751534, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.5179211497306824, + "step": 19510 + }, + { + "ce_loss": 0.14793452620506287, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.23366346955299377, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.11347433179616928, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.44513532519340515, + "step": 19510 + }, + { + "ce_loss": 0.11363400518894196, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.25194841623306274, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.07942268252372742, + "step": 19510 + }, + { + "epoch": 6.511007338225483, + "loss": 0.5855, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "grad_norm": 2.0026657581329346, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "learning_rate": 0.00022934427699857212, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.41633138060569763, + "step": 19520 + }, + { + "ce_loss": 0.08929416537284851, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.18606798350811005, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.10715822130441666, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.617181658744812, + "step": 19520 + }, + { + "ce_loss": 0.1636533886194229, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.24410457909107208, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.10966235399246216, + "step": 19520 + }, + { + "epoch": 6.514342895263509, + "loss": 0.6211, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "grad_norm": 1.1486506462097168, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "learning_rate": 0.00022895354790322122, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.5419558882713318, + "step": 19530 + }, + { + "ce_loss": 0.0822252556681633, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.24467316269874573, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.1084655374288559, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.501002848148346, + "step": 19530 + }, + { + "ce_loss": 0.09863227605819702, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.250343918800354, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.12293490767478943, + "step": 19530 + }, + { + "epoch": 6.517678452301534, + "loss": 0.5737, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "grad_norm": 1.5684605836868286, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "learning_rate": 0.000228563018437485, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.42738595604896545, + "step": 19540 + }, + { + "ce_loss": 0.10307467728853226, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.18302753567695618, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.08789996057748795, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.5575722455978394, + "step": 19540 + }, + { + "ce_loss": 0.10519887506961823, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.1517784595489502, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.07769259810447693, + "step": 19540 + }, + { + "epoch": 6.521014009339559, + "loss": 0.582, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "grad_norm": 1.8316733837127686, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "learning_rate": 0.0002281726890571537, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.5858330726623535, + "step": 19550 + }, + { + "ce_loss": 0.14364883303642273, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.2579484283924103, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.14223860204219818, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.5081810355186462, + "step": 19550 + }, + { + "ce_loss": 0.0988958552479744, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.21971705555915833, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.1278819441795349, + "step": 19550 + }, + { + "epoch": 6.524349566377585, + "loss": 0.5486, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "grad_norm": 1.4172102212905884, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "learning_rate": 0.00022778256021778367, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.5101405382156372, + "step": 19560 + }, + { + "ce_loss": 0.15842664241790771, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.2162400782108307, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.10916368663311005, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.6007856130599976, + "step": 19560 + }, + { + "ce_loss": 0.12771476805210114, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.230327770113945, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.11886480450630188, + "step": 19560 + }, + { + "epoch": 6.52768512341561, + "loss": 0.5405, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "grad_norm": 1.4749727249145508, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "learning_rate": 0.00022739263237469732, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.4140733778476715, + "step": 19570 + }, + { + "ce_loss": 0.12525179982185364, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.18022418022155762, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.10837776958942413, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.6587346792221069, + "step": 19570 + }, + { + "ce_loss": 0.11886288970708847, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.23400680720806122, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.13027873635292053, + "step": 19570 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.6594, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "grad_norm": 2.201953887939453, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "learning_rate": 0.00022700290598298204, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.6995164752006531, + "step": 19580 + }, + { + "ce_loss": 0.1675536185503006, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.347374826669693, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.12447948008775711, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.8357738256454468, + "step": 19580 + }, + { + "ce_loss": 0.16174152493476868, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.3044124245643616, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.13059574365615845, + "step": 19580 + }, + { + "epoch": 6.534356237491661, + "loss": 0.6612, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "grad_norm": 1.8671090602874756, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "learning_rate": 0.0002266133814974909, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.5545954704284668, + "step": 19590 + }, + { + "ce_loss": 0.12362905591726303, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.2608444392681122, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.1253921389579773, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.532317042350769, + "step": 19590 + }, + { + "ce_loss": 0.12566885352134705, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.24860218167304993, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.1408577263355255, + "step": 19590 + }, + { + "epoch": 6.537691794529686, + "loss": 0.5622, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "grad_norm": 1.6233079433441162, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "learning_rate": 0.00022622405937284087, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.6238809823989868, + "step": 19600 + }, + { + "ce_loss": 0.12666717171669006, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.23891451954841614, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.12072195112705231, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.9023510813713074, + "step": 19600 + }, + { + "ce_loss": 0.19780804216861725, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.34094032645225525, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.14249184727668762, + "step": 19600 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.561, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "grad_norm": 2.4079267978668213, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "learning_rate": 0.00022583494006341243, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.4726938307285309, + "step": 19610 + }, + { + "ce_loss": 0.0903717577457428, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.242726668715477, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.1005355715751648, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.5008431673049927, + "step": 19610 + }, + { + "ce_loss": 0.09989278763532639, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.23725369572639465, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.12273890525102615, + "step": 19610 + }, + { + "epoch": 6.544362908605737, + "loss": 0.6709, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "grad_norm": 1.5335849523544312, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "learning_rate": 0.0002254460240233499, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.4559919834136963, + "step": 19620 + }, + { + "ce_loss": 0.11252197623252869, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.21082544326782227, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.10504145175218582, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.6788507103919983, + "step": 19620 + }, + { + "ce_loss": 0.18909797072410583, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.3105465769767761, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.12650969624519348, + "step": 19620 + }, + { + "epoch": 6.547698465643762, + "loss": 0.5758, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "grad_norm": 1.2773675918579102, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "learning_rate": 0.0002250573117065601, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 1.0461361408233643, + "step": 19630 + }, + { + "ce_loss": 0.13800853490829468, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.2769632637500763, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.12022116780281067, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 0.8002582788467407, + "step": 19630 + }, + { + "ce_loss": 0.24044691026210785, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.3901602327823639, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.1429591327905655, + "step": 19630 + }, + { + "epoch": 6.551034022681788, + "loss": 0.6265, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "grad_norm": 1.434065341949463, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "learning_rate": 0.00022466880356671233, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.8791804313659668, + "step": 19640 + }, + { + "ce_loss": 0.1697702407836914, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.2402646541595459, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.14590327441692352, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.5269797444343567, + "step": 19640 + }, + { + "ce_loss": 0.14256873726844788, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.22623835504055023, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.12891146540641785, + "step": 19640 + }, + { + "epoch": 6.554369579719813, + "loss": 0.644, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "grad_norm": 1.3951469659805298, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "learning_rate": 0.0002242805000572371, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.6440078616142273, + "step": 19650 + }, + { + "ce_loss": 0.18327969312667847, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.27862748503685, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.1434606909751892, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.6104453206062317, + "step": 19650 + }, + { + "ce_loss": 0.15171493589878082, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.28208792209625244, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.13384582102298737, + "step": 19650 + }, + { + "epoch": 6.557705136757838, + "loss": 0.6289, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "grad_norm": 2.730189561843872, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "learning_rate": 0.00022389240163132645, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.6251262426376343, + "step": 19660 + }, + { + "ce_loss": 0.1534355878829956, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.2709234952926636, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.11832322925329208, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.7124171853065491, + "step": 19660 + }, + { + "ce_loss": 0.1600910872220993, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.3154120147228241, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.10829029977321625, + "step": 19660 + }, + { + "epoch": 6.561040693795864, + "loss": 0.6438, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "grad_norm": 3.5961387157440186, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "learning_rate": 0.0002235045087419331, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.9230295419692993, + "step": 19670 + }, + { + "ce_loss": 0.16147533059120178, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.30377304553985596, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.12979774177074432, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.5350285172462463, + "step": 19670 + }, + { + "ce_loss": 0.10126829892396927, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.22712314128875732, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.10902617871761322, + "step": 19670 + }, + { + "epoch": 6.564376250833889, + "loss": 0.6751, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "grad_norm": 1.9512437582015991, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "learning_rate": 0.00022311682184176986, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.7731413841247559, + "step": 19680 + }, + { + "ce_loss": 0.1124635860323906, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.2667849659919739, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.11324404180049896, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.52984619140625, + "step": 19680 + }, + { + "ce_loss": 0.1323978304862976, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.23706260323524475, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.12182512879371643, + "step": 19680 + }, + { + "epoch": 6.567711807871914, + "loss": 0.5983, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "grad_norm": 1.7688568830490112, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "learning_rate": 0.00022272934138330865, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.5021007657051086, + "step": 19690 + }, + { + "ce_loss": 0.15027810633182526, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.23919159173965454, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.11207623034715652, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.4850795269012451, + "step": 19690 + }, + { + "ce_loss": 0.1430709809064865, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.22025123238563538, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.09491828083992004, + "step": 19690 + }, + { + "epoch": 6.57104736490994, + "loss": 0.6223, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "grad_norm": 1.7269474267959595, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "learning_rate": 0.00022234206781878126, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.4886223375797272, + "step": 19700 + }, + { + "ce_loss": 0.12366501241922379, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.20032623410224915, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.12621523439884186, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.5551965236663818, + "step": 19700 + }, + { + "ce_loss": 0.16444770991802216, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.25806915760040283, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.1323830783367157, + "step": 19700 + }, + { + "epoch": 6.574382921947965, + "loss": 0.6061, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "grad_norm": 1.507322907447815, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "learning_rate": 0.0002219550016001776, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.5559046268463135, + "step": 19710 + }, + { + "ce_loss": 0.12786805629730225, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.27983155846595764, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.11760944873094559, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.4719151556491852, + "step": 19710 + }, + { + "ce_loss": 0.11807572841644287, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.21849408745765686, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.10040108114480972, + "step": 19710 + }, + { + "epoch": 6.57771847898599, + "loss": 0.6238, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "grad_norm": 2.1423234939575195, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "learning_rate": 0.00022156814317924562, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.810075044631958, + "step": 19720 + }, + { + "ce_loss": 0.17612165212631226, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.32744908332824707, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.19267144799232483, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.6173214912414551, + "step": 19720 + }, + { + "ce_loss": 0.1226503923535347, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.25212758779525757, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.13104918599128723, + "step": 19720 + }, + { + "epoch": 6.581054036024016, + "loss": 0.6279, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "grad_norm": 1.9452593326568604, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "learning_rate": 0.00022118149300749047, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.6032732725143433, + "step": 19730 + }, + { + "ce_loss": 0.14312177896499634, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.2730581760406494, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.1491086333990097, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.6786301136016846, + "step": 19730 + }, + { + "ce_loss": 0.13837352395057678, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.2833654582500458, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.12843720614910126, + "step": 19730 + }, + { + "epoch": 6.584389593062041, + "loss": 0.6008, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "grad_norm": 1.8112263679504395, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "learning_rate": 0.00022079505153617466, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.6665618419647217, + "step": 19740 + }, + { + "ce_loss": 0.14172880351543427, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.2878859341144562, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.15004181861877441, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.4591222107410431, + "step": 19740 + }, + { + "ce_loss": 0.09805697947740555, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.23241685330867767, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.0994115024805069, + "step": 19740 + }, + { + "epoch": 6.587725150100066, + "loss": 0.5594, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "grad_norm": 1.4407625198364258, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "learning_rate": 0.00022040881921631692, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.6755169034004211, + "step": 19750 + }, + { + "ce_loss": 0.18521958589553833, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.3709476590156555, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.1117776557803154, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.7607985734939575, + "step": 19750 + }, + { + "ce_loss": 0.12100774049758911, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.32476189732551575, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.09815852344036102, + "step": 19750 + }, + { + "epoch": 6.591060707138092, + "loss": 0.5864, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "grad_norm": 1.4858148097991943, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "learning_rate": 0.00022002279649869214, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.6402103900909424, + "step": 19760 + }, + { + "ce_loss": 0.10261153429746628, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.18682360649108887, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.08958607912063599, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.5752401947975159, + "step": 19760 + }, + { + "ce_loss": 0.13084650039672852, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.27911341190338135, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.09355991333723068, + "step": 19760 + }, + { + "epoch": 6.594396264176117, + "loss": 0.6532, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "grad_norm": 1.2409772872924805, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "learning_rate": 0.00021963698383383005, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.5927982330322266, + "step": 19770 + }, + { + "ce_loss": 0.1433178186416626, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.302937388420105, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.14607828855514526, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.5983748435974121, + "step": 19770 + }, + { + "ce_loss": 0.16430151462554932, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.23618659377098083, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.1571560800075531, + "step": 19770 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.6228, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "grad_norm": 1.5878171920776367, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "learning_rate": 0.00021925138167201564, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.701454222202301, + "step": 19780 + }, + { + "ce_loss": 0.1458064168691635, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.2965890169143677, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.12389638274908066, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.636038601398468, + "step": 19780 + }, + { + "ce_loss": 0.17314723134040833, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.31819021701812744, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.14442434906959534, + "step": 19780 + }, + { + "epoch": 6.601067378252168, + "loss": 0.6227, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "grad_norm": 2.4692835807800293, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "learning_rate": 0.00021886599046328824, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.656324565410614, + "step": 19790 + }, + { + "ce_loss": 0.15108564496040344, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.28433552384376526, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.11606331914663315, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.6400468349456787, + "step": 19790 + }, + { + "ce_loss": 0.1558593362569809, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.34761950373649597, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.1162491887807846, + "step": 19790 + }, + { + "epoch": 6.604402935290193, + "loss": 0.6978, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "grad_norm": 3.7015187740325928, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "learning_rate": 0.00021848081065744076, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.9426319599151611, + "step": 19800 + }, + { + "ce_loss": 0.08227524161338806, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.24910229444503784, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.10845942050218582, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.5317683219909668, + "step": 19800 + }, + { + "ce_loss": 0.11848925799131393, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.2813676595687866, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.1314525604248047, + "step": 19800 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.6737, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "grad_norm": 3.4987027645111084, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "learning_rate": 0.0002180958427040195, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.5265316367149353, + "step": 19810 + }, + { + "ce_loss": 0.11821822822093964, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.28454506397247314, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.1234145313501358, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.7258698344230652, + "step": 19810 + }, + { + "ce_loss": 0.19357775151729584, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.33576950430870056, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.1354198008775711, + "step": 19810 + }, + { + "epoch": 6.611074049366244, + "loss": 0.6058, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "grad_norm": 4.5951972007751465, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "learning_rate": 0.00021771108705232356, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.45154914259910583, + "step": 19820 + }, + { + "ce_loss": 0.13545911014080048, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.22132493555545807, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.09376255422830582, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.5493002533912659, + "step": 19820 + }, + { + "ce_loss": 0.12436985969543457, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.30755358934402466, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.11703960597515106, + "step": 19820 + }, + { + "epoch": 6.614409606404269, + "loss": 0.6026, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "grad_norm": 1.9668054580688477, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "learning_rate": 0.00021732654415140425, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.5193825364112854, + "step": 19830 + }, + { + "ce_loss": 0.13210050761699677, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.29486167430877686, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.09222698956727982, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.5412237644195557, + "step": 19830 + }, + { + "ce_loss": 0.11810997873544693, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.27205947041511536, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.12044981867074966, + "step": 19830 + }, + { + "epoch": 6.617745163442295, + "loss": 0.5936, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "grad_norm": 1.418731689453125, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "learning_rate": 0.00021694221445006426, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 0.44266384840011597, + "step": 19840 + }, + { + "ce_loss": 0.11144088953733444, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.20961447060108185, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.0956166684627533, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 1.132645845413208, + "step": 19840 + }, + { + "ce_loss": 0.15278828144073486, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.2940169870853424, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.1412506103515625, + "step": 19840 + }, + { + "epoch": 6.62108072048032, + "loss": 0.5931, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "grad_norm": 2.844691276550293, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "learning_rate": 0.00021655809839685782, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.6301782131195068, + "step": 19850 + }, + { + "ce_loss": 0.15369366109371185, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.25507915019989014, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.10438395291566849, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.7400352954864502, + "step": 19850 + }, + { + "ce_loss": 0.16687007248401642, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.2876301407814026, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.11608906835317612, + "step": 19850 + }, + { + "epoch": 6.624416277518345, + "loss": 0.5765, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "grad_norm": 1.8998688459396362, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "learning_rate": 0.00021617419644008972, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.6107144951820374, + "step": 19860 + }, + { + "ce_loss": 0.11949118226766586, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.22605156898498535, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.11626297235488892, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.5601349472999573, + "step": 19860 + }, + { + "ce_loss": 0.1702495664358139, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.2407875508069992, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.11027728021144867, + "step": 19860 + }, + { + "epoch": 6.627751834556371, + "loss": 0.6048, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "grad_norm": 1.0039499998092651, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "learning_rate": 0.00021579050902781498, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.5014527440071106, + "step": 19870 + }, + { + "ce_loss": 0.12715910375118256, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.23520439863204956, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.11276977509260178, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.5735328793525696, + "step": 19870 + }, + { + "ce_loss": 0.14892864227294922, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.24366481602191925, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.11686741560697556, + "step": 19870 + }, + { + "epoch": 6.631087391594396, + "loss": 0.6034, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "grad_norm": 2.24519419670105, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "learning_rate": 0.00021540703660783783, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.7075437307357788, + "step": 19880 + }, + { + "ce_loss": 0.12260802090167999, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.2838866710662842, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.11997436732053757, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.6045704483985901, + "step": 19880 + }, + { + "ce_loss": 0.16746434569358826, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.28688085079193115, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.12655295431613922, + "step": 19880 + }, + { + "epoch": 6.634422948632421, + "loss": 0.6433, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "grad_norm": 1.102996826171875, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "learning_rate": 0.00021502377962771198, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.6029108762741089, + "step": 19890 + }, + { + "ce_loss": 0.1609131395816803, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.2370425909757614, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.1219489723443985, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.6594313979148865, + "step": 19890 + }, + { + "ce_loss": 0.19880236685276031, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.30750787258148193, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.12001223862171173, + "step": 19890 + }, + { + "epoch": 6.637758505670447, + "loss": 0.6094, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "grad_norm": 0.9801173210144043, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "learning_rate": 0.0002146407385347396, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.5906522870063782, + "step": 19900 + }, + { + "ce_loss": 0.11420835554599762, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.2585413157939911, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.15924397110939026, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.5947010517120361, + "step": 19900 + }, + { + "ce_loss": 0.1298929899930954, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.2795071601867676, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.14532022178173065, + "step": 19900 + }, + { + "epoch": 6.641094062708472, + "loss": 0.624, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "grad_norm": 2.43601131439209, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "learning_rate": 0.00021425791377597072, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.5134724974632263, + "step": 19910 + }, + { + "ce_loss": 0.14351071417331696, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.22447390854358673, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.14510710537433624, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.5465179085731506, + "step": 19910 + }, + { + "ce_loss": 0.12087228894233704, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.19813315570354462, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.1294873207807541, + "step": 19910 + }, + { + "epoch": 6.644429619746497, + "loss": 0.6308, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "grad_norm": 2.103402614593506, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "learning_rate": 0.0002138753057982033, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 0.5028867721557617, + "step": 19920 + }, + { + "ce_loss": 0.11792866885662079, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.23079612851142883, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.1540123075246811, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 1.1755242347717285, + "step": 19920 + }, + { + "ce_loss": 0.16438809037208557, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.2752496898174286, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.10607973486185074, + "step": 19920 + }, + { + "epoch": 6.647765176784523, + "loss": 0.5982, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "grad_norm": 1.9858911037445068, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "learning_rate": 0.00021349291504798177, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.5443336963653564, + "step": 19930 + }, + { + "ce_loss": 0.15421469509601593, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.2539355754852295, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.09973197430372238, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.6012151837348938, + "step": 19930 + }, + { + "ce_loss": 0.14680509269237518, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.29390209913253784, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.11073038727045059, + "step": 19930 + }, + { + "epoch": 6.651100733822548, + "loss": 0.6459, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "grad_norm": 1.4681084156036377, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "learning_rate": 0.00021311074197159736, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.9424260854721069, + "step": 19940 + }, + { + "ce_loss": 0.13379743695259094, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.2306695282459259, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.11070249229669571, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.47121453285217285, + "step": 19940 + }, + { + "ce_loss": 0.0640735775232315, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.21163296699523926, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.10320694744586945, + "step": 19940 + }, + { + "epoch": 6.654436290860573, + "loss": 0.6308, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "grad_norm": 2.588499069213867, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "learning_rate": 0.00021272878701508735, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.6308241486549377, + "step": 19950 + }, + { + "ce_loss": 0.19566896557807922, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.2965598702430725, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.11354543268680573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.42993810772895813, + "step": 19950 + }, + { + "ce_loss": 0.08403245359659195, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.21457430720329285, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.08863157033920288, + "step": 19950 + }, + { + "epoch": 6.657771847898599, + "loss": 0.569, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "grad_norm": 1.2746546268463135, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "learning_rate": 0.000212347050624234, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.5694778561592102, + "step": 19960 + }, + { + "ce_loss": 0.1148284450173378, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.30706483125686646, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.10699665546417236, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.5459879636764526, + "step": 19960 + }, + { + "ce_loss": 0.11935421824455261, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.22676879167556763, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.0830860361456871, + "step": 19960 + }, + { + "epoch": 6.661107404936624, + "loss": 0.6038, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "grad_norm": 2.4813232421875, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "learning_rate": 0.00021196553324456482, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.7138746976852417, + "step": 19970 + }, + { + "ce_loss": 0.214167058467865, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.3329528570175171, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.1282276213169098, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.4795456528663635, + "step": 19970 + }, + { + "ce_loss": 0.13675269484519958, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.22916845977306366, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.11339738219976425, + "step": 19970 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.6919, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "grad_norm": 1.3644931316375732, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "learning_rate": 0.0002115842353213517, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.5393396615982056, + "step": 19980 + }, + { + "ce_loss": 0.10573761910200119, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.2217199206352234, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.09572423249483109, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.4343937039375305, + "step": 19980 + }, + { + "ce_loss": 0.0948638767004013, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.2013542652130127, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.10309243202209473, + "step": 19980 + }, + { + "epoch": 6.667778519012675, + "loss": 0.6172, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "grad_norm": 1.7576813697814941, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "learning_rate": 0.0002112031572996105, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.7140644788742065, + "step": 19990 + }, + { + "ce_loss": 0.2293837070465088, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.3485688865184784, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.1356680691242218, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.6210101246833801, + "step": 19990 + }, + { + "ce_loss": 0.16044512391090393, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.2541531026363373, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.13174085319042206, + "step": 19990 + }, + { + "epoch": 6.6711140760507, + "loss": 0.6575, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "grad_norm": 2.1928982734680176, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "learning_rate": 0.00021082229962409997, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.5633940696716309, + "step": 20000 + }, + { + "ce_loss": 0.19303011894226074, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.2460951954126358, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.12412123382091522, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.39880824089050293, + "step": 20000 + }, + { + "ce_loss": 0.08768723905086517, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.1907358467578888, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.12010857462882996, + "step": 20000 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.5422, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "grad_norm": 1.323743224143982, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "learning_rate": 0.00021044166273932212, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.6675091981887817, + "step": 20010 + }, + { + "ce_loss": 0.1589205414056778, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.24982638657093048, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.158026322722435, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.42034339904785156, + "step": 20010 + }, + { + "ce_loss": 0.09510764479637146, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.2156810611486435, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.10906819999217987, + "step": 20010 + }, + { + "epoch": 6.677785190126751, + "loss": 0.5977, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "grad_norm": 1.453233003616333, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "learning_rate": 0.00021006124708952117, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.5579141974449158, + "step": 20020 + }, + { + "ce_loss": 0.15179507434368134, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.28201889991760254, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.12390779703855515, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.5579657554626465, + "step": 20020 + }, + { + "ce_loss": 0.10925626009702682, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.33345967531204224, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.09767402708530426, + "step": 20020 + }, + { + "epoch": 6.681120747164776, + "loss": 0.5548, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "grad_norm": 1.407572865486145, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "learning_rate": 0.00020968105311868312, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.7544924020767212, + "step": 20030 + }, + { + "ce_loss": 0.15766873955726624, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.4101894497871399, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.14876404404640198, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.6088945865631104, + "step": 20030 + }, + { + "ce_loss": 0.09153804928064346, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.22568279504776, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.10213732719421387, + "step": 20030 + }, + { + "epoch": 6.684456304202802, + "loss": 0.5674, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "grad_norm": 1.58797025680542, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "learning_rate": 0.00020930108127053526, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.4611407518386841, + "step": 20040 + }, + { + "ce_loss": 0.11111941933631897, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.2381780445575714, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.08858481049537659, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.5269935131072998, + "step": 20040 + }, + { + "ce_loss": 0.12398573756217957, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.20361027121543884, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.10248218476772308, + "step": 20040 + }, + { + "epoch": 6.687791861240827, + "loss": 0.5685, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "grad_norm": 2.0134479999542236, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "learning_rate": 0.0002089213319885456, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.41977277398109436, + "step": 20050 + }, + { + "ce_loss": 0.07606784999370575, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.1722736656665802, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.08966904133558273, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.8288323879241943, + "step": 20050 + }, + { + "ce_loss": 0.20224134624004364, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.2906288802623749, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.1143127977848053, + "step": 20050 + }, + { + "epoch": 6.691127418278852, + "loss": 0.5795, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "grad_norm": 2.094135046005249, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "learning_rate": 0.00020854180571592244, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.6016823053359985, + "step": 20060 + }, + { + "ce_loss": 0.10965229570865631, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.26941967010498047, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.11607593297958374, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.5134243369102478, + "step": 20060 + }, + { + "ce_loss": 0.12020660936832428, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.22750511765480042, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.1258605271577835, + "step": 20060 + }, + { + "epoch": 6.694462975316878, + "loss": 0.6106, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "grad_norm": 2.0959811210632324, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "learning_rate": 0.00020816250289561387, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.4728214144706726, + "step": 20070 + }, + { + "ce_loss": 0.1056213229894638, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.2268831729888916, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.10493393987417221, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.7689107060432434, + "step": 20070 + }, + { + "ce_loss": 0.15808147192001343, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.30683696269989014, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.10166820138692856, + "step": 20070 + }, + { + "epoch": 6.697798532354903, + "loss": 0.6243, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "grad_norm": 2.2686753273010254, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "learning_rate": 0.00020778342397030693, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.639522910118103, + "step": 20080 + }, + { + "ce_loss": 0.1620551347732544, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.3112872540950775, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.1349463313817978, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.5782184600830078, + "step": 20080 + }, + { + "ce_loss": 0.14392095804214478, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.2486431896686554, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.1579214185476303, + "step": 20080 + }, + { + "epoch": 6.701134089392928, + "loss": 0.6096, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "grad_norm": 1.7706823348999023, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "learning_rate": 0.0002074045693824275, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.7955951690673828, + "step": 20090 + }, + { + "ce_loss": 0.10051006078720093, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.2936019003391266, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.11262084543704987, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.4961718022823334, + "step": 20090 + }, + { + "ce_loss": 0.1312670260667801, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.2476268708705902, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.09604895859956741, + "step": 20090 + }, + { + "epoch": 6.704469646430954, + "loss": 0.6018, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "grad_norm": 1.0696649551391602, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "learning_rate": 0.00020702593957413971, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.5171288847923279, + "step": 20100 + }, + { + "ce_loss": 0.16199396550655365, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.2498839944601059, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.10504694283008575, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.6891152858734131, + "step": 20100 + }, + { + "ce_loss": 0.16890086233615875, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.27810660004615784, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.13627681136131287, + "step": 20100 + }, + { + "epoch": 6.707805203468979, + "loss": 0.636, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "grad_norm": 1.3927689790725708, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "learning_rate": 0.00020664753498734554, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.44129008054733276, + "step": 20110 + }, + { + "ce_loss": 0.10400844365358353, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.2058953046798706, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.1013561561703682, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.5037030577659607, + "step": 20110 + }, + { + "ce_loss": 0.140580952167511, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.20188310742378235, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.13261297345161438, + "step": 20110 + }, + { + "epoch": 6.711140760507004, + "loss": 0.5781, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "grad_norm": 1.409254550933838, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "learning_rate": 0.00020626935606368342, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.45771175622940063, + "step": 20120 + }, + { + "ce_loss": 0.08707324415445328, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.2113952487707138, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.11662886291742325, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.6429270505905151, + "step": 20120 + }, + { + "ce_loss": 0.10872305184602737, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.2824891209602356, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.12237393110990524, + "step": 20120 + }, + { + "epoch": 6.71447631754503, + "loss": 0.6371, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "grad_norm": 1.7113875150680542, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "learning_rate": 0.0002058914032445289, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.5755742192268372, + "step": 20130 + }, + { + "ce_loss": 0.15556345880031586, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.23803101480007172, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.1206558421254158, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.6326802372932434, + "step": 20130 + }, + { + "ce_loss": 0.1521034687757492, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.32943302392959595, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.12123904377222061, + "step": 20130 + }, + { + "epoch": 6.717811874583055, + "loss": 0.6306, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "grad_norm": 1.4116154909133911, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "learning_rate": 0.00020551367697099404, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.6386064291000366, + "step": 20140 + }, + { + "ce_loss": 0.1630381941795349, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.359139621257782, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.11624115705490112, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.7267446517944336, + "step": 20140 + }, + { + "ce_loss": 0.09767545014619827, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.2750421464443207, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.11414022743701935, + "step": 20140 + }, + { + "epoch": 6.72114743162108, + "loss": 0.6079, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "grad_norm": 1.8769910335540771, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "learning_rate": 0.00020513617768392562, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.6491705775260925, + "step": 20150 + }, + { + "ce_loss": 0.08372032642364502, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.2563663721084595, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.10501579195261002, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.7487512230873108, + "step": 20150 + }, + { + "ce_loss": 0.1768907606601715, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.263735830783844, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.1748792678117752, + "step": 20150 + }, + { + "epoch": 6.724482988659106, + "loss": 0.5921, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "grad_norm": 1.0785341262817383, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "learning_rate": 0.00020475890582390607, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.6199877262115479, + "step": 20160 + }, + { + "ce_loss": 0.14046601951122284, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.25729498267173767, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.1453024297952652, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.5459374785423279, + "step": 20160 + }, + { + "ce_loss": 0.12143059819936752, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.26327502727508545, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.12969057261943817, + "step": 20160 + }, + { + "epoch": 6.727818545697131, + "loss": 0.6833, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "grad_norm": 4.0255961418151855, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "learning_rate": 0.0002043818618312522, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.5204952955245972, + "step": 20170 + }, + { + "ce_loss": 0.12066006660461426, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.2361370325088501, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.13006658852100372, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.6388102173805237, + "step": 20170 + }, + { + "ce_loss": 0.1897311508655548, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.31053489446640015, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.11395397782325745, + "step": 20170 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.5882, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "grad_norm": 2.0025501251220703, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "learning_rate": 0.00020400504614601515, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.6728408336639404, + "step": 20180 + }, + { + "ce_loss": 0.18863923847675323, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.32389456033706665, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.15970559418201447, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.545183539390564, + "step": 20180 + }, + { + "ce_loss": 0.11113359034061432, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.23649749159812927, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.12272656708955765, + "step": 20180 + }, + { + "epoch": 6.734489659773182, + "loss": 0.6417, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "grad_norm": 1.689151644706726, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "learning_rate": 0.00020362845920797898, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.6169806718826294, + "step": 20190 + }, + { + "ce_loss": 0.1869346797466278, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.2971252501010895, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.10956262052059174, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.43868401646614075, + "step": 20190 + }, + { + "ce_loss": 0.11213511228561401, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.22156301140785217, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.10480619966983795, + "step": 20190 + }, + { + "epoch": 6.737825216811207, + "loss": 0.5495, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "grad_norm": 1.1433922052383423, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "learning_rate": 0.0002032521014566614, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.6528211236000061, + "step": 20200 + }, + { + "ce_loss": 0.16544964909553528, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.2605472803115845, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.12842920422554016, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.6164642572402954, + "step": 20200 + }, + { + "ce_loss": 0.1673274040222168, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.2415827512741089, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.128843292593956, + "step": 20200 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.5752, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "grad_norm": 1.4512262344360352, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "learning_rate": 0.00020287597333131232, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.5911370515823364, + "step": 20210 + }, + { + "ce_loss": 0.10620848089456558, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.28432372212409973, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.11449485272169113, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.5357170104980469, + "step": 20210 + }, + { + "ce_loss": 0.12888075411319733, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.1979178935289383, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.1619427651166916, + "step": 20210 + }, + { + "epoch": 6.744496330887258, + "loss": 0.5962, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "grad_norm": 1.2024363279342651, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "learning_rate": 0.000202500075270914, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.3185579776763916, + "step": 20220 + }, + { + "ce_loss": 0.03577226400375366, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.1970852166414261, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.06377988308668137, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.4299435019493103, + "step": 20220 + }, + { + "ce_loss": 0.08698124438524246, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.2009291648864746, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.08196726441383362, + "step": 20220 + }, + { + "epoch": 6.747831887925283, + "loss": 0.5674, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "grad_norm": 3.3245155811309814, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "learning_rate": 0.00020212440771417956, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.5869963765144348, + "step": 20230 + }, + { + "ce_loss": 0.16733090579509735, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.29322487115859985, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.12613563239574432, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.5164973735809326, + "step": 20230 + }, + { + "ce_loss": 0.11459538340568542, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.22587640583515167, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.0881464034318924, + "step": 20230 + }, + { + "epoch": 6.751167444963309, + "loss": 0.6506, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "grad_norm": 1.2759590148925781, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "learning_rate": 0.00020174897109955338, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.6824588179588318, + "step": 20240 + }, + { + "ce_loss": 0.10708875209093094, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.37594008445739746, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.1465224176645279, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.5953068733215332, + "step": 20240 + }, + { + "ce_loss": 0.08849772065877914, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.26119664311408997, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.11912015080451965, + "step": 20240 + }, + { + "epoch": 6.754503002001334, + "loss": 0.6006, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "grad_norm": 1.7669841051101685, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "learning_rate": 0.00020137376586521085, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.6398548483848572, + "step": 20250 + }, + { + "ce_loss": 0.16085104644298553, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.26059287786483765, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.11758947372436523, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.5975475907325745, + "step": 20250 + }, + { + "ce_loss": 0.14858491718769073, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.23770475387573242, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.10475146770477295, + "step": 20250 + }, + { + "epoch": 6.757838559039359, + "loss": 0.7459, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "grad_norm": 1.4378308057785034, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "learning_rate": 0.00020099879244905676, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.46980538964271545, + "step": 20260 + }, + { + "ce_loss": 0.10932157188653946, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.26287272572517395, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.09724202007055283, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.44305968284606934, + "step": 20260 + }, + { + "ce_loss": 0.07696262001991272, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.24827659130096436, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.11764057725667953, + "step": 20260 + }, + { + "epoch": 6.761174116077385, + "loss": 0.6272, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "grad_norm": 1.1700968742370605, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "learning_rate": 0.0002006240512887251, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.5167644023895264, + "step": 20270 + }, + { + "ce_loss": 0.13465654850006104, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.25756293535232544, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.09358154237270355, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.6173653602600098, + "step": 20270 + }, + { + "ce_loss": 0.1697811782360077, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.2595403790473938, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.11999509483575821, + "step": 20270 + }, + { + "epoch": 6.76450967311541, + "loss": 0.5765, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "grad_norm": 1.1367998123168945, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "learning_rate": 0.0002002495428215794, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.4597811698913574, + "step": 20280 + }, + { + "ce_loss": 0.10254833847284317, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.23351740837097168, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.12356647849082947, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.7911397218704224, + "step": 20280 + }, + { + "ce_loss": 0.1131599172949791, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.26167136430740356, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.12388217449188232, + "step": 20280 + }, + { + "epoch": 6.767845230153435, + "loss": 0.5609, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "grad_norm": 1.8873783349990845, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "learning_rate": 0.0001998752674847112, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.6501887440681458, + "step": 20290 + }, + { + "ce_loss": 0.18052825331687927, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.2731967568397522, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.14176741242408752, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.7093862295150757, + "step": 20290 + }, + { + "ce_loss": 0.16083455085754395, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.24816177785396576, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.15041442215442657, + "step": 20290 + }, + { + "epoch": 6.771180787191461, + "loss": 0.6082, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "grad_norm": 2.3813726902008057, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "learning_rate": 0.00019950122571494038, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.5191268920898438, + "step": 20300 + }, + { + "ce_loss": 0.1417926847934723, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.25416481494903564, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.12228623032569885, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.7120752334594727, + "step": 20300 + }, + { + "ce_loss": 0.10820872336626053, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.21667304635047913, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.12783846259117126, + "step": 20300 + }, + { + "epoch": 6.774516344229486, + "loss": 0.6154, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "grad_norm": 1.7450374364852905, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "learning_rate": 0.00019912741794881348, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.42318326234817505, + "step": 20310 + }, + { + "ce_loss": 0.0868317186832428, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.2112046629190445, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.10673084110021591, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.5567744374275208, + "step": 20310 + }, + { + "ce_loss": 0.1336977630853653, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.23806677758693695, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.11441931873559952, + "step": 20310 + }, + { + "epoch": 6.777851901267511, + "loss": 0.5744, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "grad_norm": 1.237932801246643, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "learning_rate": 0.00019875384462260466, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.34857824444770813, + "step": 20320 + }, + { + "ce_loss": 0.062375832349061966, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.19724702835083008, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.08858238160610199, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.5977160930633545, + "step": 20320 + }, + { + "ce_loss": 0.16513806581497192, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.2866596281528473, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.08822454512119293, + "step": 20320 + }, + { + "epoch": 6.781187458305537, + "loss": 0.5947, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "grad_norm": 1.3662400245666504, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "learning_rate": 0.00019838050617231417, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.7048400640487671, + "step": 20330 + }, + { + "ce_loss": 0.1667744368314743, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.27122920751571655, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.12117403745651245, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.835236668586731, + "step": 20330 + }, + { + "ce_loss": 0.20711590349674225, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.27225542068481445, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.17776216566562653, + "step": 20330 + }, + { + "epoch": 6.784523015343562, + "loss": 0.5946, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "grad_norm": 2.784146785736084, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "learning_rate": 0.00019800740303366822, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.5680144429206848, + "step": 20340 + }, + { + "ce_loss": 0.15996579825878143, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.25848570466041565, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.11828726530075073, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.706413745880127, + "step": 20340 + }, + { + "ce_loss": 0.11593703180551529, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.2646014392375946, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.10727044194936752, + "step": 20340 + }, + { + "epoch": 6.787858572381587, + "loss": 0.5514, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "grad_norm": 1.3041218519210815, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "learning_rate": 0.00019763453564211795, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 0.4949185252189636, + "step": 20350 + }, + { + "ce_loss": 0.13301748037338257, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.24916112422943115, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.11259014904499054, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 0.6650046706199646, + "step": 20350 + }, + { + "ce_loss": 0.10717565566301346, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.2219514697790146, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.10976817458868027, + "step": 20350 + }, + { + "epoch": 6.791194129419613, + "loss": 0.6297, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "grad_norm": 1.8092106580734253, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "learning_rate": 0.00019726190443284018, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.4623590111732483, + "step": 20360 + }, + { + "ce_loss": 0.09109138697385788, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.22240050137043, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.11351132392883301, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.5489964485168457, + "step": 20360 + }, + { + "ce_loss": 0.12590204179286957, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.2594718933105469, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.13404986262321472, + "step": 20360 + }, + { + "epoch": 6.794529686457638, + "loss": 0.6037, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "grad_norm": 1.3519389629364014, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "learning_rate": 0.00019688950984073563, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.47837817668914795, + "step": 20370 + }, + { + "ce_loss": 0.09936484694480896, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.2567150592803955, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.1220482885837555, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.5348929762840271, + "step": 20370 + }, + { + "ce_loss": 0.14279955625534058, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.25085532665252686, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.11229889839887619, + "step": 20370 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.608, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "grad_norm": 2.1077394485473633, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "learning_rate": 0.00019651735230042852, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.618941605091095, + "step": 20380 + }, + { + "ce_loss": 0.12865251302719116, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.22706618905067444, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.10655650496482849, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.5625501275062561, + "step": 20380 + }, + { + "ce_loss": 0.09577847272157669, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.2644766569137573, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.14019955694675446, + "step": 20380 + }, + { + "epoch": 6.801200800533689, + "loss": 0.6264, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "grad_norm": 1.5742781162261963, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "learning_rate": 0.00019614543224626688, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.5375264883041382, + "step": 20390 + }, + { + "ce_loss": 0.11992277204990387, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.21302179992198944, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.10849941521883011, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.9217984676361084, + "step": 20390 + }, + { + "ce_loss": 0.1319456547498703, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.23134317994117737, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.10734369605779648, + "step": 20390 + }, + { + "epoch": 6.804536357571714, + "loss": 0.5953, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "grad_norm": 1.4438633918762207, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "learning_rate": 0.00019577375011232154, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 0.5196937322616577, + "step": 20400 + }, + { + "ce_loss": 0.08157113939523697, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.2544917166233063, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.1237298771739006, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 1.1017556190490723, + "step": 20400 + }, + { + "ce_loss": 0.1311476230621338, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.364717960357666, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.1520640254020691, + "step": 20400 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.6329, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "grad_norm": 1.2128087282180786, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "learning_rate": 0.0001954023063323854, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.5847972631454468, + "step": 20410 + }, + { + "ce_loss": 0.13812997937202454, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.2738398313522339, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.13805897533893585, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.7715336084365845, + "step": 20410 + }, + { + "ce_loss": 0.10610752552747726, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.24427151679992676, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.10848834365606308, + "step": 20410 + }, + { + "epoch": 6.811207471647765, + "loss": 0.6139, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "grad_norm": 2.0211548805236816, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "learning_rate": 0.00019503110133997357, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.6625146269798279, + "step": 20420 + }, + { + "ce_loss": 0.12832562625408173, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.2269144058227539, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.11166250705718994, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.5218786597251892, + "step": 20420 + }, + { + "ce_loss": 0.1277444213628769, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.1871224343776703, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.13162465393543243, + "step": 20420 + }, + { + "epoch": 6.81454302868579, + "loss": 0.5679, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "grad_norm": 1.8764227628707886, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "learning_rate": 0.00019466013556832193, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.5578723549842834, + "step": 20430 + }, + { + "ce_loss": 0.14493246376514435, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.270059734582901, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.11068219691514969, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.5604507923126221, + "step": 20430 + }, + { + "ce_loss": 0.16456195712089539, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.23753252625465393, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.135604128241539, + "step": 20430 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.6251, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "grad_norm": 2.8086681365966797, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "learning_rate": 0.0001942894094503875, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.4727604389190674, + "step": 20440 + }, + { + "ce_loss": 0.08744503557682037, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.22879493236541748, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.12131603062152863, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.6767879724502563, + "step": 20440 + }, + { + "ce_loss": 0.0807797983288765, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.25815528631210327, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.09768194705247879, + "step": 20440 + }, + { + "epoch": 6.821214142761841, + "loss": 0.6603, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "grad_norm": 2.1304433345794678, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "learning_rate": 0.00019391892341884766, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.4594484865665436, + "step": 20450 + }, + { + "ce_loss": 0.10429593920707703, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.22468678653240204, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.0894937589764595, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.5158959627151489, + "step": 20450 + }, + { + "ce_loss": 0.10562293976545334, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.25338807702064514, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.1233699768781662, + "step": 20450 + }, + { + "epoch": 6.824549699799866, + "loss": 0.5414, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "grad_norm": 1.297013759613037, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "learning_rate": 0.0001935486779060994, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.4934908449649811, + "step": 20460 + }, + { + "ce_loss": 0.11889277398586273, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.24133098125457764, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.09454958140850067, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.5913838744163513, + "step": 20460 + }, + { + "ce_loss": 0.11880073696374893, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.23581376671791077, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.11633745580911636, + "step": 20460 + }, + { + "epoch": 6.827885256837892, + "loss": 0.597, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "grad_norm": 2.1039810180664062, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "learning_rate": 0.00019317867334425913, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 0.5328623652458191, + "step": 20470 + }, + { + "ce_loss": 0.13337953388690948, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.20354673266410828, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.1669895201921463, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 0.9063245058059692, + "step": 20470 + }, + { + "ce_loss": 0.07711745798587799, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.24881622195243835, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.10024960339069366, + "step": 20470 + }, + { + "epoch": 6.831220813875917, + "loss": 0.5842, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "grad_norm": 1.2597817182540894, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "learning_rate": 0.00019280891016516195, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.4338398873806, + "step": 20480 + }, + { + "ce_loss": 0.08800597488880157, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.22484973073005676, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.09363920986652374, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.6309686303138733, + "step": 20480 + }, + { + "ce_loss": 0.12850897014141083, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.24589794874191284, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.1430305540561676, + "step": 20480 + }, + { + "epoch": 6.834556370913942, + "loss": 0.6174, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "grad_norm": 1.2456542253494263, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "learning_rate": 0.00019243938880036149, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.7051937580108643, + "step": 20490 + }, + { + "ce_loss": 0.1593933254480362, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.3085207939147949, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.15409360826015472, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.7055217623710632, + "step": 20490 + }, + { + "ce_loss": 0.15836520493030548, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.3123631477355957, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.12352539598941803, + "step": 20490 + }, + { + "epoch": 6.837891927951968, + "loss": 0.5777, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "grad_norm": 1.9752660989761353, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "learning_rate": 0.00019207010968112854, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.4566480815410614, + "step": 20500 + }, + { + "ce_loss": 0.11850826442241669, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.2331375777721405, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.10483713448047638, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.5652526021003723, + "step": 20500 + }, + { + "ce_loss": 0.14450666308403015, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.23598337173461914, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.12301039695739746, + "step": 20500 + }, + { + "epoch": 6.841227484989993, + "loss": 0.5996, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "grad_norm": 1.2535284757614136, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "learning_rate": 0.0001917010732384518, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.6502880454063416, + "step": 20510 + }, + { + "ce_loss": 0.1329495757818222, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.30579808354377747, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.11312013864517212, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.4479178488254547, + "step": 20510 + }, + { + "ce_loss": 0.06486482918262482, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.24453657865524292, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.10726383328437805, + "step": 20510 + }, + { + "epoch": 6.844563042028018, + "loss": 0.6205, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "grad_norm": 1.237539529800415, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "learning_rate": 0.00019133227990303646, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.5721306204795837, + "step": 20520 + }, + { + "ce_loss": 0.13463234901428223, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.21573877334594727, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.07603821903467178, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.5392432808876038, + "step": 20520 + }, + { + "ce_loss": 0.0802914947271347, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.2541607916355133, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.07735491544008255, + "step": 20520 + }, + { + "epoch": 6.847898599066044, + "loss": 0.5919, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "grad_norm": 1.5495575666427612, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "learning_rate": 0.00019096373010530422, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.5933955311775208, + "step": 20530 + }, + { + "ce_loss": 0.11911506950855255, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.29116567969322205, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.09230422228574753, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.7542802691459656, + "step": 20530 + }, + { + "ce_loss": 0.11548207700252533, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.2644978165626526, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.13322116434574127, + "step": 20530 + }, + { + "epoch": 6.851234156104069, + "loss": 0.6136, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "grad_norm": 1.5028530359268188, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "learning_rate": 0.000190595424275392, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.5793891549110413, + "step": 20540 + }, + { + "ce_loss": 0.142436683177948, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.30143263936042786, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.1353730857372284, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.682435929775238, + "step": 20540 + }, + { + "ce_loss": 0.13634717464447021, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.32032471895217896, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.13282988965511322, + "step": 20540 + }, + { + "epoch": 6.854569713142094, + "loss": 0.6238, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "grad_norm": 2.3763959407806396, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "learning_rate": 0.00019022736284315256, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.5390588045120239, + "step": 20550 + }, + { + "ce_loss": 0.13439245522022247, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.20202970504760742, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.11675325036048889, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.7126962542533875, + "step": 20550 + }, + { + "ce_loss": 0.10502856969833374, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.2715202569961548, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.1214003637433052, + "step": 20550 + }, + { + "epoch": 6.85790527018012, + "loss": 0.5758, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "grad_norm": 1.0272542238235474, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "learning_rate": 0.0001898595462381531, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.4998115301132202, + "step": 20560 + }, + { + "ce_loss": 0.10326630622148514, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.23785142600536346, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.10605143010616302, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.5037967562675476, + "step": 20560 + }, + { + "ce_loss": 0.13062965869903564, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.2593288719654083, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.09385646879673004, + "step": 20560 + }, + { + "epoch": 6.861240827218145, + "loss": 0.5603, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "grad_norm": 1.4330753087997437, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "learning_rate": 0.0001894919748896753, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 0.44807168841362, + "step": 20570 + }, + { + "ce_loss": 0.11219009011983871, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.21106593310832977, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.10159025341272354, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 0.6952852010726929, + "step": 20570 + }, + { + "ce_loss": 0.17087072134017944, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.30461642146110535, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.1436925232410431, + "step": 20570 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.5903, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "grad_norm": 2.121004581451416, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "learning_rate": 0.00018912464922671434, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.6659446358680725, + "step": 20580 + }, + { + "ce_loss": 0.21983709931373596, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.3302499055862427, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.11565963923931122, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.5692784786224365, + "step": 20580 + }, + { + "ce_loss": 0.08923733234405518, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.2060239613056183, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.09318091720342636, + "step": 20580 + }, + { + "epoch": 6.867911941294196, + "loss": 0.5868, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "grad_norm": 1.8310197591781616, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "learning_rate": 0.0001887575696779789, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 1.1411585807800293, + "step": 20590 + }, + { + "ce_loss": 0.15374459326267242, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.2981603443622589, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.1392323076725006, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 0.4504973888397217, + "step": 20590 + }, + { + "ce_loss": 0.11747331917285919, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.20415905117988586, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.10316745191812515, + "step": 20590 + }, + { + "epoch": 6.871247498332221, + "loss": 0.6374, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "grad_norm": 1.2335033416748047, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "learning_rate": 0.00018839073667189021, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.5924480557441711, + "step": 20600 + }, + { + "ce_loss": 0.15742860734462738, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.21921710669994354, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.12174395471811295, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.3247009217739105, + "step": 20600 + }, + { + "ce_loss": 0.07317493110895157, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.1657167673110962, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.08565723896026611, + "step": 20600 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.5437, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "grad_norm": 2.104970932006836, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "learning_rate": 0.00018802415063658216, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.5424124002456665, + "step": 20610 + }, + { + "ce_loss": 0.16102339327335358, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.2664029896259308, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.08970746397972107, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.5977059006690979, + "step": 20610 + }, + { + "ce_loss": 0.0721786767244339, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.20928135514259338, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.09289656579494476, + "step": 20610 + }, + { + "epoch": 6.877918612408272, + "loss": 0.6608, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "grad_norm": 2.158698558807373, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "learning_rate": 0.00018765781199989965, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 0.8673412799835205, + "step": 20620 + }, + { + "ce_loss": 0.15912854671478271, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 0.2529142498970032, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.14166447520256042, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 0.6670170426368713, + "step": 20620 + }, + { + "ce_loss": 0.13092097640037537, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 0.2830592095851898, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.08492620289325714, + "step": 20620 + }, + { + "epoch": 6.881254169446297, + "loss": 0.6021, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "grad_norm": 1.5774734020233154, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "learning_rate": 0.0001872917211893995, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.46152612566947937, + "step": 20630 + }, + { + "ce_loss": 0.1152229830622673, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.2160659283399582, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.0975537896156311, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.501051127910614, + "step": 20630 + }, + { + "ce_loss": 0.12634457647800446, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.19545955955982208, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.10637293010950089, + "step": 20630 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.5485, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "grad_norm": 1.7691820859909058, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "learning_rate": 0.00018692587863234912, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.5000690817832947, + "step": 20640 + }, + { + "ce_loss": 0.11656507849693298, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.2345747947692871, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.11726991087198257, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.5075468420982361, + "step": 20640 + }, + { + "ce_loss": 0.10804903507232666, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.27083054184913635, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.12640805542469025, + "step": 20640 + }, + { + "epoch": 6.887925283522348, + "loss": 0.6349, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "grad_norm": 3.4797141551971436, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "learning_rate": 0.00018656028475572622, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 0.4004143476486206, + "step": 20650 + }, + { + "ce_loss": 0.06690140068531036, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 0.21979433298110962, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.10764598846435547, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 0.5797668099403381, + "step": 20650 + }, + { + "ce_loss": 0.052560314536094666, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 0.2451619654893875, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.10148611664772034, + "step": 20650 + }, + { + "epoch": 6.891260840560373, + "loss": 0.6285, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "grad_norm": 1.2031223773956299, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "learning_rate": 0.00018619493998621795, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.8077090978622437, + "step": 20660 + }, + { + "ce_loss": 0.14289332926273346, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.23440909385681152, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.14893965423107147, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.5020773410797119, + "step": 20660 + }, + { + "ce_loss": 0.1051107794046402, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.28205716609954834, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.11419624090194702, + "step": 20660 + }, + { + "epoch": 6.894596397598399, + "loss": 0.6513, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "grad_norm": 2.129122495651245, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "learning_rate": 0.0001858298447502211, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.5558404922485352, + "step": 20670 + }, + { + "ce_loss": 0.1453867107629776, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.2337738275527954, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.12421823292970657, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.5864662528038025, + "step": 20670 + }, + { + "ce_loss": 0.17098017036914825, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.267102986574173, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.11548590660095215, + "step": 20670 + }, + { + "epoch": 6.897931954636424, + "loss": 0.5988, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "grad_norm": 1.1095306873321533, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "learning_rate": 0.00018546499947384105, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.4309590756893158, + "step": 20680 + }, + { + "ce_loss": 0.10610014945268631, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.21524930000305176, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.08393006771802902, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.5816032290458679, + "step": 20680 + }, + { + "ce_loss": 0.13894164562225342, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.3071152865886688, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.13415858149528503, + "step": 20680 + }, + { + "epoch": 6.901267511674449, + "loss": 0.6277, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "grad_norm": 1.5599712133407593, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "learning_rate": 0.00018510040458289155, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.49218297004699707, + "step": 20690 + }, + { + "ce_loss": 0.1152329295873642, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.22641527652740479, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.11307185143232346, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.4263184070587158, + "step": 20690 + }, + { + "ce_loss": 0.09234665334224701, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.21400585770606995, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.08838541060686111, + "step": 20690 + }, + { + "epoch": 6.904603068712475, + "loss": 0.6255, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "grad_norm": 1.3064284324645996, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "learning_rate": 0.00018473606050289405, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.8105974197387695, + "step": 20700 + }, + { + "ce_loss": 0.23844876885414124, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.3396758735179901, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.23215480148792267, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.5795097351074219, + "step": 20700 + }, + { + "ce_loss": 0.11340656131505966, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.29568004608154297, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.11840888112783432, + "step": 20700 + }, + { + "epoch": 6.9079386257505, + "loss": 0.6162, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "grad_norm": 1.2173100709915161, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "learning_rate": 0.00018437196765907728, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.7617021799087524, + "step": 20710 + }, + { + "ce_loss": 0.17053262889385223, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.3001687824726105, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.17243169248104095, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.641623854637146, + "step": 20710 + }, + { + "ce_loss": 0.1281440109014511, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.320505827665329, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.11534615606069565, + "step": 20710 + }, + { + "epoch": 6.911274182788525, + "loss": 0.6241, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "grad_norm": 1.6815094947814941, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "learning_rate": 0.00018400812647637697, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.9845396280288696, + "step": 20720 + }, + { + "ce_loss": 0.07850717008113861, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.21867156028747559, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.11635127663612366, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.4374261498451233, + "step": 20720 + }, + { + "ce_loss": 0.06454581767320633, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.268862783908844, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.10372491180896759, + "step": 20720 + }, + { + "epoch": 6.914609739826551, + "loss": 0.5813, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "grad_norm": 1.5123430490493774, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "learning_rate": 0.0001836445373794346, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.4976940453052521, + "step": 20730 + }, + { + "ce_loss": 0.12870489060878754, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.23885779082775116, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.129926860332489, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.671566903591156, + "step": 20730 + }, + { + "ce_loss": 0.1277875006198883, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.22397693991661072, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.12168483436107635, + "step": 20730 + }, + { + "epoch": 6.917945296864576, + "loss": 0.5684, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "grad_norm": 3.491680383682251, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "learning_rate": 0.00018328120079259792, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.5865914225578308, + "step": 20740 + }, + { + "ce_loss": 0.16594484448432922, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.2859644889831543, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.13439278304576874, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.26961272954940796, + "step": 20740 + }, + { + "ce_loss": 0.038990482687950134, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.16177760064601898, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.06862502545118332, + "step": 20740 + }, + { + "epoch": 6.921280853902601, + "loss": 0.5856, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "grad_norm": 1.7987585067749023, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "learning_rate": 0.00018291811713991982, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.7716285586357117, + "step": 20750 + }, + { + "ce_loss": 0.18918243050575256, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.26490822434425354, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.13002419471740723, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.44953399896621704, + "step": 20750 + }, + { + "ce_loss": 0.07777289301156998, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.2360752373933792, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.1030561625957489, + "step": 20750 + }, + { + "epoch": 6.924616410940627, + "loss": 0.5994, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "grad_norm": 1.4929414987564087, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "learning_rate": 0.00018255528684515816, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.6236853003501892, + "step": 20760 + }, + { + "ce_loss": 0.14601117372512817, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.2984391450881958, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.12102136015892029, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.8715800046920776, + "step": 20760 + }, + { + "ce_loss": 0.132913276553154, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.3128644526004791, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.1070551872253418, + "step": 20760 + }, + { + "epoch": 6.927951967978652, + "loss": 0.5638, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "grad_norm": 2.015686511993408, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "learning_rate": 0.0001821927103317746, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.723768413066864, + "step": 20770 + }, + { + "ce_loss": 0.17608648538589478, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.37144795060157776, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.11942605674266815, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.5449997782707214, + "step": 20770 + }, + { + "ce_loss": 0.09751887619495392, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.2715526819229126, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.09283023327589035, + "step": 20770 + }, + { + "epoch": 6.931287525016677, + "loss": 0.6042, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "grad_norm": 1.518445611000061, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "learning_rate": 0.0001818303880229351, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.7221246957778931, + "step": 20780 + }, + { + "ce_loss": 0.16267530620098114, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.3028196394443512, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.1369720995426178, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.6272754073143005, + "step": 20780 + }, + { + "ce_loss": 0.1474248319864273, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.3028441071510315, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.1394912451505661, + "step": 20780 + }, + { + "epoch": 6.934623082054703, + "loss": 0.6315, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "grad_norm": 2.4681501388549805, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "learning_rate": 0.00018146832034150867, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.5065264105796814, + "step": 20790 + }, + { + "ce_loss": 0.09136957675218582, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.2777387499809265, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.0956275686621666, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.5836489200592041, + "step": 20790 + }, + { + "ce_loss": 0.11539237946271896, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.27783769369125366, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.11264175921678543, + "step": 20790 + }, + { + "epoch": 6.937958639092728, + "loss": 0.634, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "grad_norm": 2.4432549476623535, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "learning_rate": 0.00018110650771006772, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.6803032159805298, + "step": 20800 + }, + { + "ce_loss": 0.1700230836868286, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.3196053206920624, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.14488141238689423, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.4583851993083954, + "step": 20800 + }, + { + "ce_loss": 0.1260090172290802, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.2324456125497818, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.09967635571956635, + "step": 20800 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.5952, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "grad_norm": 2.0505409240722656, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "learning_rate": 0.00018074495055088598, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 1.0997909307479858, + "step": 20810 + }, + { + "ce_loss": 0.10792729258537292, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.28397682309150696, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.10799378156661987, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.6416740417480469, + "step": 20810 + }, + { + "ce_loss": 0.10900291800498962, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.2863513231277466, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.11546456813812256, + "step": 20810 + }, + { + "epoch": 6.944629753168779, + "loss": 0.6465, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "grad_norm": 3.0398612022399902, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "learning_rate": 0.0001803836492859398, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.6857156753540039, + "step": 20820 + }, + { + "ce_loss": 0.13258329033851624, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.24844446778297424, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.11770449578762054, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.7357656359672546, + "step": 20820 + }, + { + "ce_loss": 0.19352516531944275, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.30092155933380127, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.13199712336063385, + "step": 20820 + }, + { + "epoch": 6.947965310206804, + "loss": 0.595, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "grad_norm": 1.429013729095459, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "learning_rate": 0.00018002260433690656, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.5082059502601624, + "step": 20830 + }, + { + "ce_loss": 0.12579958140850067, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.23692786693572998, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.1452646404504776, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.5477200746536255, + "step": 20830 + }, + { + "ce_loss": 0.08218104392290115, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.23850448429584503, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.10290637612342834, + "step": 20830 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.5805, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "grad_norm": 2.0708327293395996, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "learning_rate": 0.00017966181612516478, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.5284128785133362, + "step": 20840 + }, + { + "ce_loss": 0.13292750716209412, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.254790335893631, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.11771418154239655, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.5159513354301453, + "step": 20840 + }, + { + "ce_loss": 0.10002075135707855, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.22777974605560303, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.10233756899833679, + "step": 20840 + }, + { + "epoch": 6.954636424282855, + "loss": 0.5243, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "grad_norm": 1.4802135229110718, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "learning_rate": 0.00017930128507179281, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.605171799659729, + "step": 20850 + }, + { + "ce_loss": 0.16204921901226044, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.31319138407707214, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.09180094301700592, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.4061069190502167, + "step": 20850 + }, + { + "ce_loss": 0.1162717342376709, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.17830415070056915, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.08810706436634064, + "step": 20850 + }, + { + "epoch": 6.95797198132088, + "loss": 0.5665, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "grad_norm": 1.2751325368881226, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "learning_rate": 0.00017894101159756932, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 1.0285208225250244, + "step": 20860 + }, + { + "ce_loss": 0.2166115641593933, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.24041934311389923, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.19319362938404083, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 0.6077567934989929, + "step": 20860 + }, + { + "ce_loss": 0.18720628321170807, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.28821271657943726, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.13216865062713623, + "step": 20860 + }, + { + "epoch": 6.961307538358906, + "loss": 0.6149, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "grad_norm": 1.9878860712051392, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "learning_rate": 0.00017858099612297226, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.7590935230255127, + "step": 20870 + }, + { + "ce_loss": 0.14209294319152832, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.2697787284851074, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.12165077030658722, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.7105188965797424, + "step": 20870 + }, + { + "ce_loss": 0.13031932711601257, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.2726278007030487, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.14885155856609344, + "step": 20870 + }, + { + "epoch": 6.964643095396931, + "loss": 0.5828, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "grad_norm": 1.4370341300964355, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "learning_rate": 0.00017822123906817848, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.5326735377311707, + "step": 20880 + }, + { + "ce_loss": 0.14173398911952972, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.29813286662101746, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.09259048104286194, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.5765227675437927, + "step": 20880 + }, + { + "ce_loss": 0.15944123268127441, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.268841028213501, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.11185889691114426, + "step": 20880 + }, + { + "epoch": 6.967978652434956, + "loss": 0.602, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "grad_norm": 2.715527057647705, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "learning_rate": 0.000177861740853063, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.7061032056808472, + "step": 20890 + }, + { + "ce_loss": 0.14459113776683807, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.2508234977722168, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.14969336986541748, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.6149274706840515, + "step": 20890 + }, + { + "ce_loss": 0.12304043024778366, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.2720663845539093, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.11349066346883774, + "step": 20890 + }, + { + "epoch": 6.971314209472982, + "loss": 0.5916, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "grad_norm": 1.2655351161956787, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "learning_rate": 0.00017750250189719883, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.5163251161575317, + "step": 20900 + }, + { + "ce_loss": 0.14423733949661255, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.21587130427360535, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.10627759993076324, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.5793828964233398, + "step": 20900 + }, + { + "ce_loss": 0.09142829477787018, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.2316550761461258, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.12005052715539932, + "step": 20900 + }, + { + "epoch": 6.974649766511007, + "loss": 0.6028, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "grad_norm": 2.137049674987793, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "learning_rate": 0.00017714352261985697, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.710256814956665, + "step": 20910 + }, + { + "ce_loss": 0.17917829751968384, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.2954549789428711, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.13196998834609985, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.5921691656112671, + "step": 20910 + }, + { + "ce_loss": 0.1159336194396019, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.2698439061641693, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.11621291190385818, + "step": 20910 + }, + { + "epoch": 6.977985323549032, + "loss": 0.5851, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "grad_norm": 4.465473175048828, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "learning_rate": 0.00017678480344000442, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.7718786001205444, + "step": 20920 + }, + { + "ce_loss": 0.10429341346025467, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.2350814789533615, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.11928503215312958, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.5180379152297974, + "step": 20920 + }, + { + "ce_loss": 0.11934104561805725, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.26697003841400146, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.10038395971059799, + "step": 20920 + }, + { + "epoch": 6.981320880587058, + "loss": 0.5868, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "grad_norm": 1.331379771232605, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "learning_rate": 0.00017642634477630517, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.5460637211799622, + "step": 20930 + }, + { + "ce_loss": 0.1347649246454239, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.28631535172462463, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.12491101026535034, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.7150649428367615, + "step": 20930 + }, + { + "ce_loss": 0.16883759200572968, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.31282228231430054, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.17268231511116028, + "step": 20930 + }, + { + "epoch": 6.984656437625083, + "loss": 0.5762, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "grad_norm": 1.295901894569397, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "learning_rate": 0.00017606814704711915, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.6643977761268616, + "step": 20940 + }, + { + "ce_loss": 0.1359083652496338, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.23353464901447296, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.11746269464492798, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.6325167417526245, + "step": 20940 + }, + { + "ce_loss": 0.14522510766983032, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.21115204691886902, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.09273610264062881, + "step": 20940 + }, + { + "epoch": 6.987991994663108, + "loss": 0.5873, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "grad_norm": 2.0272459983825684, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "learning_rate": 0.00017571021067050153, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.6666278839111328, + "step": 20950 + }, + { + "ce_loss": 0.14961141347885132, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.2823046147823334, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.1209111213684082, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.5868027806282043, + "step": 20950 + }, + { + "ce_loss": 0.14440715312957764, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.23055055737495422, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.1070292592048645, + "step": 20950 + }, + { + "epoch": 6.991327551701134, + "loss": 0.5565, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "grad_norm": 1.3845330476760864, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "learning_rate": 0.0001753525360642028, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.7456711530685425, + "step": 20960 + }, + { + "ce_loss": 0.15054984390735626, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.32618165016174316, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.12339648604393005, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.5007907748222351, + "step": 20960 + }, + { + "ce_loss": 0.14651557803153992, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.2281855344772339, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.11004862934350967, + "step": 20960 + }, + { + "epoch": 6.994663108739159, + "loss": 0.6319, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "grad_norm": 1.6815906763076782, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "learning_rate": 0.0001749951236456674, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.5305715799331665, + "step": 20970 + }, + { + "ce_loss": 0.08651381731033325, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.24971520900726318, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.10385092347860336, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.6306924223899841, + "step": 20970 + }, + { + "ce_loss": 0.08177436143159866, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.32261404395103455, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.13266070187091827, + "step": 20970 + }, + { + "epoch": 6.997998665777184, + "loss": 0.5823, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "grad_norm": 2.4300663471221924, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "learning_rate": 0.00017463797383203425, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.5562842488288879, + "step": 20980 + }, + { + "ce_loss": 0.12637288868427277, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.27352625131607056, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.10983546078205109, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.5764535069465637, + "step": 20980 + }, + { + "ce_loss": 0.151265949010849, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.28327447175979614, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.11406426131725311, + "step": 20980 + }, + { + "epoch": 7.00133422281521, + "loss": 0.5262, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "grad_norm": 1.4760485887527466, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "learning_rate": 0.0001742810870401356, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.41110774874687195, + "step": 20990 + }, + { + "ce_loss": 0.08459983021020889, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.23713694512844086, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.08919335901737213, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.8198530673980713, + "step": 20990 + }, + { + "ce_loss": 0.17153599858283997, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.24566666781902313, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.12967519462108612, + "step": 20990 + }, + { + "epoch": 7.004669779853235, + "loss": 0.5345, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "grad_norm": 1.178972601890564, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "learning_rate": 0.00017392446368649686, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.4565522372722626, + "step": 21000 + }, + { + "ce_loss": 0.08238189667463303, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.2576136887073517, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.07807490974664688, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.3339233100414276, + "step": 21000 + }, + { + "ce_loss": 0.05025802552700043, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.17898201942443848, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.06952040642499924, + "step": 21000 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.5173, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "grad_norm": 1.0856938362121582, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "learning_rate": 0.00017356810418733547, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.4113653302192688, + "step": 21010 + }, + { + "ce_loss": 0.07443442195653915, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.20666775107383728, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.09216611832380295, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.41373708844184875, + "step": 21010 + }, + { + "ce_loss": 0.07116129994392395, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.21457628905773163, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.1037018895149231, + "step": 21010 + }, + { + "epoch": 7.011340893929286, + "loss": 0.5155, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "grad_norm": 1.611261248588562, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "learning_rate": 0.00017321200895856168, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.49294358491897583, + "step": 21020 + }, + { + "ce_loss": 0.11892196536064148, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.1827605664730072, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.11067811399698257, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.5166816711425781, + "step": 21020 + }, + { + "ce_loss": 0.07782851159572601, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.2171994149684906, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.0989086925983429, + "step": 21020 + }, + { + "epoch": 7.014676450967311, + "loss": 0.5386, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "grad_norm": 1.1420142650604248, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "learning_rate": 0.00017285617841577704, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.5629929304122925, + "step": 21030 + }, + { + "ce_loss": 0.11045968532562256, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.23228420317173004, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.10397525131702423, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.44841301441192627, + "step": 21030 + }, + { + "ce_loss": 0.09309506416320801, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.21713057160377502, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.08877697587013245, + "step": 21030 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.5425, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "grad_norm": 1.5629714727401733, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "learning_rate": 0.00017250061297427368, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.38156411051750183, + "step": 21040 + }, + { + "ce_loss": 0.06925290822982788, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.20131511986255646, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.09016577154397964, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.6269943714141846, + "step": 21040 + }, + { + "ce_loss": 0.05716466158628464, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.18947528302669525, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.10615312308073044, + "step": 21040 + }, + { + "epoch": 7.021347565043362, + "loss": 0.5463, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "grad_norm": 1.1643590927124023, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "learning_rate": 0.00017214531304903492, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.49292388558387756, + "step": 21050 + }, + { + "ce_loss": 0.09436366707086563, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.2636721730232239, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.1069205105304718, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.5487685203552246, + "step": 21050 + }, + { + "ce_loss": 0.11184797435998917, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.2645128667354584, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.09122076630592346, + "step": 21050 + }, + { + "epoch": 7.024683122081387, + "loss": 0.5474, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "grad_norm": 3.8690576553344727, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "learning_rate": 0.00017179027905473403, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.39867842197418213, + "step": 21060 + }, + { + "ce_loss": 0.042192067950963974, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.2422800064086914, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.09038354456424713, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.4865795373916626, + "step": 21060 + }, + { + "ce_loss": 0.07612650096416473, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.2698541283607483, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.10052892565727234, + "step": 21060 + }, + { + "epoch": 7.028018679119413, + "loss": 0.5082, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "grad_norm": 1.5303665399551392, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "learning_rate": 0.000171435511405734, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.33546188473701477, + "step": 21070 + }, + { + "ce_loss": 0.06674647331237793, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.1889113336801529, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.07956317812204361, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.5631605982780457, + "step": 21070 + }, + { + "ce_loss": 0.15716561675071716, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.23613443970680237, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.11336501687765121, + "step": 21070 + }, + { + "epoch": 7.031354236157438, + "loss": 0.5358, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "grad_norm": 1.5098323822021484, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "learning_rate": 0.00017108101051608657, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.3946211338043213, + "step": 21080 + }, + { + "ce_loss": 0.0674547627568245, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.1942397654056549, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.08900465071201324, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.7322803735733032, + "step": 21080 + }, + { + "ce_loss": 0.09472820162773132, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.2562478184700012, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.09331964701414108, + "step": 21080 + }, + { + "epoch": 7.034689793195463, + "loss": 0.5493, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "grad_norm": 1.0239431858062744, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "learning_rate": 0.0001707267767995326, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.43437060713768005, + "step": 21090 + }, + { + "ce_loss": 0.08299209922552109, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.22612957656383514, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.1250525861978531, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.48301953077316284, + "step": 21090 + }, + { + "ce_loss": 0.07930684089660645, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.2962947189807892, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.1071903184056282, + "step": 21090 + }, + { + "epoch": 7.038025350233489, + "loss": 0.5005, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "grad_norm": 1.2620716094970703, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "learning_rate": 0.0001703728106695009, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.5874612331390381, + "step": 21100 + }, + { + "ce_loss": 0.14442382752895355, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.25857293605804443, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.1242932677268982, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.30933573842048645, + "step": 21100 + }, + { + "ce_loss": 0.038306642323732376, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.17048147320747375, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.03882153332233429, + "step": 21100 + }, + { + "epoch": 7.041360907271514, + "loss": 0.4931, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "grad_norm": 1.7004448175430298, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "learning_rate": 0.00017001911253910817, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.491072416305542, + "step": 21110 + }, + { + "ce_loss": 0.07500729709863663, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.23993931710720062, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.07017786055803299, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.5245043635368347, + "step": 21110 + }, + { + "ce_loss": 0.09140478074550629, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.19429965317249298, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.0974140390753746, + "step": 21110 + }, + { + "epoch": 7.044696464309539, + "loss": 0.4823, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "grad_norm": 1.2850415706634521, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "learning_rate": 0.00016966568282115785, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.5977675318717957, + "step": 21120 + }, + { + "ce_loss": 0.1628226935863495, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.2581752836704254, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.13995656371116638, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.5749611854553223, + "step": 21120 + }, + { + "ce_loss": 0.1332138031721115, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.277914434671402, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.12948699295520782, + "step": 21120 + }, + { + "epoch": 7.048032021347565, + "loss": 0.5306, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "grad_norm": 1.5037014484405518, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "learning_rate": 0.0001693125219281408, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.4484368562698364, + "step": 21130 + }, + { + "ce_loss": 0.07722995430231094, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.19566670060157776, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.10668282955884933, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.36792391538619995, + "step": 21130 + }, + { + "ce_loss": 0.08133158087730408, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.20554950833320618, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.061777785420417786, + "step": 21130 + }, + { + "epoch": 7.05136757838559, + "loss": 0.5248, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "grad_norm": 3.220675468444824, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "learning_rate": 0.00016895963027223365, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.3645973801612854, + "step": 21140 + }, + { + "ce_loss": 0.09136815369129181, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.1671840101480484, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.10586903989315033, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.45609939098358154, + "step": 21140 + }, + { + "ce_loss": 0.06002714857459068, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.19941852986812592, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.11288649588823318, + "step": 21140 + }, + { + "epoch": 7.054703135423615, + "loss": 0.5007, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "grad_norm": 1.32329261302948, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "learning_rate": 0.00016860700826529907, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.5643146634101868, + "step": 21150 + }, + { + "ce_loss": 0.1046956479549408, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.24031804502010345, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.10297542810440063, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.2995739281177521, + "step": 21150 + }, + { + "ce_loss": 0.040338970720767975, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.1610741913318634, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.06947105377912521, + "step": 21150 + }, + { + "epoch": 7.058038692461641, + "loss": 0.5235, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "grad_norm": 1.9927266836166382, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "learning_rate": 0.0001682546563188846, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.3663199245929718, + "step": 21160 + }, + { + "ce_loss": 0.06230630725622177, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.18640950322151184, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.08341743797063828, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.43763241171836853, + "step": 21160 + }, + { + "ce_loss": 0.08718358725309372, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.22233569622039795, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.10211227089166641, + "step": 21160 + }, + { + "epoch": 7.061374249499666, + "loss": 0.5244, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "grad_norm": 4.421433925628662, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "learning_rate": 0.0001679025748442231, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.4997464120388031, + "step": 21170 + }, + { + "ce_loss": 0.10188456624746323, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.23916609585285187, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.11447103321552277, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.5505445003509521, + "step": 21170 + }, + { + "ce_loss": 0.1000547856092453, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.2162446230649948, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.09203381836414337, + "step": 21170 + }, + { + "epoch": 7.064709806537691, + "loss": 0.5328, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "grad_norm": 2.3827109336853027, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "learning_rate": 0.00016755076425223147, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.43425941467285156, + "step": 21180 + }, + { + "ce_loss": 0.12137731909751892, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.22484569251537323, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.08779873698949814, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.4549023509025574, + "step": 21180 + }, + { + "ce_loss": 0.07701903581619263, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.22629885375499725, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.08000829070806503, + "step": 21180 + }, + { + "epoch": 7.068045363575717, + "loss": 0.4869, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "grad_norm": 1.7620294094085693, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "learning_rate": 0.00016719922495351064, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.42543548345565796, + "step": 21190 + }, + { + "ce_loss": 0.09694749861955643, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.20173221826553345, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.09933895617723465, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.6733165979385376, + "step": 21190 + }, + { + "ce_loss": 0.11281838268041611, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.21210214495658875, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.08334226161241531, + "step": 21190 + }, + { + "epoch": 7.071380920613742, + "loss": 0.5431, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "grad_norm": 1.6558399200439453, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "learning_rate": 0.00016684795735834453, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.41487324237823486, + "step": 21200 + }, + { + "ce_loss": 0.07864386588335037, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.21175822615623474, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.0927249863743782, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.33858710527420044, + "step": 21200 + }, + { + "ce_loss": 0.06142342835664749, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.1634354591369629, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.08492116630077362, + "step": 21200 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.4871, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "grad_norm": 1.2901017665863037, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "learning_rate": 0.00016649696187670041, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.3924500644207001, + "step": 21210 + }, + { + "ce_loss": 0.07660618424415588, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.20859317481517792, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.07732430845499039, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.5552083253860474, + "step": 21210 + }, + { + "ce_loss": 0.0491691492497921, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.18049979209899902, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.07998926937580109, + "step": 21210 + }, + { + "epoch": 7.078052034689793, + "loss": 0.4609, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "grad_norm": 1.6316123008728027, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "learning_rate": 0.00016614623891822778, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.5180302262306213, + "step": 21220 + }, + { + "ce_loss": 0.11370652168989182, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.23079824447631836, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.11144647747278214, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.4807301163673401, + "step": 21220 + }, + { + "ce_loss": 0.12959855794906616, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.21934184432029724, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.10569732636213303, + "step": 21220 + }, + { + "epoch": 7.081387591727818, + "loss": 0.4896, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "grad_norm": 1.6368104219436646, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "learning_rate": 0.00016579578889225796, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.3786579668521881, + "step": 21230 + }, + { + "ce_loss": 0.09192191064357758, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.20896805822849274, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.07722792774438858, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.34697994589805603, + "step": 21230 + }, + { + "ce_loss": 0.06574396044015884, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.1986175775527954, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.08240870386362076, + "step": 21230 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.4941, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "grad_norm": 38.46177291870117, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "learning_rate": 0.000165445612207804, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.4953051507472992, + "step": 21240 + }, + { + "ce_loss": 0.14665032923221588, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.25713422894477844, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.09130918979644775, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.4350004494190216, + "step": 21240 + }, + { + "ce_loss": 0.05916459113359451, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.2253180742263794, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.1087227463722229, + "step": 21240 + }, + { + "epoch": 7.088058705803869, + "loss": 0.4739, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "grad_norm": 1.1803487539291382, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "learning_rate": 0.00016509570927355962, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.45185840129852295, + "step": 21250 + }, + { + "ce_loss": 0.04442691057920456, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.16830293834209442, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.06186540797352791, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.4900377094745636, + "step": 21250 + }, + { + "ce_loss": 0.11686450242996216, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.25501832365989685, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.11018063873052597, + "step": 21250 + }, + { + "epoch": 7.091394262841894, + "loss": 0.4635, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "grad_norm": 1.046940803527832, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "learning_rate": 0.00016474608049789943, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.35495126247406006, + "step": 21260 + }, + { + "ce_loss": 0.0532936193048954, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.1743641197681427, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.09107319265604019, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.43149006366729736, + "step": 21260 + }, + { + "ce_loss": 0.0692746564745903, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.22161293029785156, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.0617385059595108, + "step": 21260 + }, + { + "epoch": 7.09472981987992, + "loss": 0.49, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "grad_norm": 1.3952606916427612, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "learning_rate": 0.00016439672628887757, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.6341660022735596, + "step": 21270 + }, + { + "ce_loss": 0.15768156945705414, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.27653753757476807, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.10778310894966125, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.42467135190963745, + "step": 21270 + }, + { + "ce_loss": 0.07942281663417816, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.23953743278980255, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.08827744424343109, + "step": 21270 + }, + { + "epoch": 7.098065376917945, + "loss": 0.6013, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "grad_norm": 1.2766448259353638, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "learning_rate": 0.00016404764705422802, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.4827376902103424, + "step": 21280 + }, + { + "ce_loss": 0.06714701652526855, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.29545265436172485, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.08229733258485794, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.6882094144821167, + "step": 21280 + }, + { + "ce_loss": 0.08233946561813354, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.2522861659526825, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.09695421904325485, + "step": 21280 + }, + { + "epoch": 7.10140093395597, + "loss": 0.5352, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "grad_norm": 0.9868097901344299, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "learning_rate": 0.00016369884320136392, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 1.093414545059204, + "step": 21290 + }, + { + "ce_loss": 0.10553121566772461, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.28743916749954224, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.11427765339612961, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 0.5632927417755127, + "step": 21290 + }, + { + "ce_loss": 0.07510565966367722, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.2500094473361969, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.07853087037801743, + "step": 21290 + }, + { + "epoch": 7.104736490993996, + "loss": 0.5351, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "grad_norm": 1.2402269840240479, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "learning_rate": 0.00016335031513737687, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.43117618560791016, + "step": 21300 + }, + { + "ce_loss": 0.08685576915740967, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.26075318455696106, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.08343677967786789, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.48218634724617004, + "step": 21300 + }, + { + "ce_loss": 0.0882478728890419, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.2749446928501129, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.08692453801631927, + "step": 21300 + }, + { + "epoch": 7.108072048032021, + "loss": 0.4891, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "grad_norm": 1.0379635095596313, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "learning_rate": 0.00016300206326903672, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.5212631225585938, + "step": 21310 + }, + { + "ce_loss": 0.1073252409696579, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.27759629487991333, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.09776793420314789, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.4921794533729553, + "step": 21310 + }, + { + "ce_loss": 0.10279887169599533, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.2897863984107971, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.09936974197626114, + "step": 21310 + }, + { + "epoch": 7.111407605070046, + "loss": 0.5527, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "grad_norm": 2.3853776454925537, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "learning_rate": 0.0001626540880027907, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.3984506130218506, + "step": 21320 + }, + { + "ce_loss": 0.06048259884119034, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.2146771401166916, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.07563339173793793, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.5447241067886353, + "step": 21320 + }, + { + "ce_loss": 0.07636505365371704, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.22990357875823975, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.09303893148899078, + "step": 21320 + }, + { + "epoch": 7.114743162108072, + "loss": 0.5337, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "grad_norm": 1.1696761846542358, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "learning_rate": 0.00016230638974476337, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.5816092491149902, + "step": 21330 + }, + { + "ce_loss": 0.11028946191072464, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.21065345406532288, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.07422541826963425, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.4654453992843628, + "step": 21330 + }, + { + "ce_loss": 0.08083511143922806, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.25947773456573486, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.09762498736381531, + "step": 21330 + }, + { + "epoch": 7.118078719146097, + "loss": 0.5283, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "grad_norm": 1.145909070968628, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "learning_rate": 0.00016195896890075617, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.9589320421218872, + "step": 21340 + }, + { + "ce_loss": 0.08689472824335098, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.294956773519516, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.09442543238401413, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.4515564441680908, + "step": 21340 + }, + { + "ce_loss": 0.1072135865688324, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.25195497274398804, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.09227581322193146, + "step": 21340 + }, + { + "epoch": 7.121414276184122, + "loss": 0.5224, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "grad_norm": 1.4755406379699707, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "learning_rate": 0.0001616118258762465, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.9713801145553589, + "step": 21350 + }, + { + "ce_loss": 0.16981974244117737, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.3541743755340576, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.12030630558729172, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.3955955505371094, + "step": 21350 + }, + { + "ce_loss": 0.0644855797290802, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.23553195595741272, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.06928061693906784, + "step": 21350 + }, + { + "epoch": 7.124749833222148, + "loss": 0.5653, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "grad_norm": 1.499052882194519, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "learning_rate": 0.00016126496107638766, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.4505382180213928, + "step": 21360 + }, + { + "ce_loss": 0.09331570565700531, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.2338477224111557, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.09295643866062164, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.49329978227615356, + "step": 21360 + }, + { + "ce_loss": 0.09330855309963226, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.24846550822257996, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.09859941154718399, + "step": 21360 + }, + { + "epoch": 7.128085390260173, + "loss": 0.4869, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "grad_norm": 1.757819414138794, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "learning_rate": 0.0001609183749060082, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.4814741015434265, + "step": 21370 + }, + { + "ce_loss": 0.11460345983505249, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.22094333171844482, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.11573362350463867, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.41160690784454346, + "step": 21370 + }, + { + "ce_loss": 0.09539221227169037, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.20575033128261566, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.07427817583084106, + "step": 21370 + }, + { + "epoch": 7.131420947298198, + "loss": 0.5221, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "grad_norm": 1.0497280359268188, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "learning_rate": 0.0001605720677696116, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.5087268352508545, + "step": 21380 + }, + { + "ce_loss": 0.11951367557048798, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.24779492616653442, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.10620385408401489, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.4907005727291107, + "step": 21380 + }, + { + "ce_loss": 0.03994790464639664, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.21446077525615692, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.0749351978302002, + "step": 21380 + }, + { + "epoch": 7.134756504336224, + "loss": 0.5462, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "grad_norm": 1.3052724599838257, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "learning_rate": 0.00016022604007137533, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 0.7197709083557129, + "step": 21390 + }, + { + "ce_loss": 0.14835871756076813, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.23564325273036957, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.1125996932387352, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 1.0123602151870728, + "step": 21390 + }, + { + "ce_loss": 0.12748490273952484, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.2982759177684784, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.12087155878543854, + "step": 21390 + }, + { + "epoch": 7.138092061374249, + "loss": 0.5642, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "grad_norm": 1.72278892993927, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "learning_rate": 0.000159880292215151, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.5797778367996216, + "step": 21400 + }, + { + "ce_loss": 0.11908047646284103, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.26884138584136963, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.11582252383232117, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.4562729299068451, + "step": 21400 + }, + { + "ce_loss": 0.06269034743309021, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.23549781739711761, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.1063077449798584, + "step": 21400 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.5039, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "grad_norm": 1.1651095151901245, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "learning_rate": 0.00015953482460446362, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.33815377950668335, + "step": 21410 + }, + { + "ce_loss": 0.06978192925453186, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.17233459651470184, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.07788750529289246, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.49239495396614075, + "step": 21410 + }, + { + "ce_loss": 0.10934551805257797, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.23807445168495178, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.1099422425031662, + "step": 21410 + }, + { + "epoch": 7.1447631754503, + "loss": 0.5337, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "grad_norm": 1.296330451965332, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "learning_rate": 0.00015918963764251118, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.5494571328163147, + "step": 21420 + }, + { + "ce_loss": 0.11908654123544693, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.2070154845714569, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.12471811473369598, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.5093758702278137, + "step": 21420 + }, + { + "ce_loss": 0.11129189282655716, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.2689700126647949, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.1290213018655777, + "step": 21420 + }, + { + "epoch": 7.148098732488325, + "loss": 0.5399, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "grad_norm": 2.2043728828430176, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "learning_rate": 0.00015884473173216374, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.4269636273384094, + "step": 21430 + }, + { + "ce_loss": 0.08743033558130264, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.2163121998310089, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.12303852289915085, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.47688212990760803, + "step": 21430 + }, + { + "ce_loss": 0.11899134516716003, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.20621243119239807, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.07384546101093292, + "step": 21430 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.5083, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "grad_norm": 1.2066408395767212, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "learning_rate": 0.00015850010727596375, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.49252405762672424, + "step": 21440 + }, + { + "ce_loss": 0.08908739686012268, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.22361807525157928, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.08267315477132797, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.5239272117614746, + "step": 21440 + }, + { + "ce_loss": 0.10164398699998856, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.24876506626605988, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.0726809948682785, + "step": 21440 + }, + { + "epoch": 7.154769846564376, + "loss": 0.4696, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "grad_norm": 1.5307971239089966, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "learning_rate": 0.00015815576467612504, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.3120725452899933, + "step": 21450 + }, + { + "ce_loss": 0.05826739966869354, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.1749739646911621, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.058990590274333954, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.3526802361011505, + "step": 21450 + }, + { + "ce_loss": 0.06731356680393219, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.19125425815582275, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.06343653798103333, + "step": 21450 + }, + { + "epoch": 7.158105403602401, + "loss": 0.5202, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "grad_norm": 1.4682691097259521, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "learning_rate": 0.0001578117043345325, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.5320351719856262, + "step": 21460 + }, + { + "ce_loss": 0.11158326268196106, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.22310832142829895, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.1012367531657219, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.38997533917427063, + "step": 21460 + }, + { + "ce_loss": 0.06510186195373535, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.20934811234474182, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.0889061838388443, + "step": 21460 + }, + { + "epoch": 7.161440960640427, + "loss": 0.4985, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "grad_norm": 1.1266052722930908, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "learning_rate": 0.0001574679266527415, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.46760207414627075, + "step": 21470 + }, + { + "ce_loss": 0.10974862426519394, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.21274375915527344, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.08004342019557953, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.4235474765300751, + "step": 21470 + }, + { + "ce_loss": 0.07097727060317993, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.18562710285186768, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.0987720862030983, + "step": 21470 + }, + { + "epoch": 7.164776517678452, + "loss": 0.452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "grad_norm": 1.5619783401489258, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "learning_rate": 0.00015712443203197763, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.8044539093971252, + "step": 21480 + }, + { + "ce_loss": 0.17771480977535248, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.19461336731910706, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.14369718730449677, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.4453772306442261, + "step": 21480 + }, + { + "ce_loss": 0.10373017191886902, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.1853133887052536, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.10268549621105194, + "step": 21480 + }, + { + "epoch": 7.168112074716477, + "loss": 0.4631, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "grad_norm": 36.443809509277344, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "learning_rate": 0.00015678122087313607, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.5804122686386108, + "step": 21490 + }, + { + "ce_loss": 0.05610696226358414, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.217197448015213, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.10133112967014313, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.564479649066925, + "step": 21490 + }, + { + "ce_loss": 0.1543521136045456, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.23337118327617645, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.11876802891492844, + "step": 21490 + }, + { + "epoch": 7.171447631754503, + "loss": 0.4692, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "grad_norm": 1.0185561180114746, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "learning_rate": 0.00015643829357678133, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.37515789270401, + "step": 21500 + }, + { + "ce_loss": 0.05240470916032791, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.19510981440544128, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.08341556787490845, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.38667699694633484, + "step": 21500 + }, + { + "ce_loss": 0.06962460279464722, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.23856179416179657, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.07823915034532547, + "step": 21500 + }, + { + "epoch": 7.174783188792528, + "loss": 0.4711, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "grad_norm": 1.8188211917877197, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "learning_rate": 0.00015609565054314616, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.3922473192214966, + "step": 21510 + }, + { + "ce_loss": 0.08343679457902908, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.21345853805541992, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.07090325653553009, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.48854389786720276, + "step": 21510 + }, + { + "ce_loss": 0.14709074795246124, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.21674580872058868, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.12438128143548965, + "step": 21510 + }, + { + "epoch": 7.178118745830553, + "loss": 0.4811, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "grad_norm": 1.4690632820129395, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "learning_rate": 0.00015575329217213199, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.3819815218448639, + "step": 21520 + }, + { + "ce_loss": 0.056364353746175766, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.18936829268932343, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.09404139965772629, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.47050580382347107, + "step": 21520 + }, + { + "ce_loss": 0.1282368004322052, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.21063852310180664, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.10341715812683105, + "step": 21520 + }, + { + "epoch": 7.181454302868579, + "loss": 0.467, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "grad_norm": 1.3604241609573364, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "learning_rate": 0.00015541121886330795, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.4614931344985962, + "step": 21530 + }, + { + "ce_loss": 0.10464883595705032, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.26484405994415283, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.07564658671617508, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.5691704750061035, + "step": 21530 + }, + { + "ce_loss": 0.08403594046831131, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.21320290863513947, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.09035076946020126, + "step": 21530 + }, + { + "epoch": 7.184789859906604, + "loss": 0.4867, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "grad_norm": 1.1559165716171265, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "learning_rate": 0.00015506943101591038, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.5226683020591736, + "step": 21540 + }, + { + "ce_loss": 0.10446419566869736, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.2686152160167694, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.11150383204221725, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.30404114723205566, + "step": 21540 + }, + { + "ce_loss": 0.0629800483584404, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.13201919198036194, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.0805046334862709, + "step": 21540 + }, + { + "epoch": 7.188125416944629, + "loss": 0.5161, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "grad_norm": 2.5077500343322754, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "learning_rate": 0.00015472792902884237, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.5977288484573364, + "step": 21550 + }, + { + "ce_loss": 0.08067172020673752, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.2463260442018509, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.09701764583587646, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.40372413396835327, + "step": 21550 + }, + { + "ce_loss": 0.08983927965164185, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.21180443465709686, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.07517854869365692, + "step": 21550 + }, + { + "epoch": 7.191460973982655, + "loss": 0.4764, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "grad_norm": 1.196245551109314, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "learning_rate": 0.0001543867133006734, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.464174747467041, + "step": 21560 + }, + { + "ce_loss": 0.08118841797113419, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.239656001329422, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.11102952063083649, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.516892671585083, + "step": 21560 + }, + { + "ce_loss": 0.1302904486656189, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.2542354464530945, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.09391342103481293, + "step": 21560 + }, + { + "epoch": 7.19479653102068, + "loss": 0.4568, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "grad_norm": 1.5264159440994263, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "learning_rate": 0.00015404578422963932, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.84468674659729, + "step": 21570 + }, + { + "ce_loss": 0.07420466095209122, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.18314522504806519, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.08030659705400467, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.5354803800582886, + "step": 21570 + }, + { + "ce_loss": 0.09700847417116165, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.2780064642429352, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.11125379055738449, + "step": 21570 + }, + { + "epoch": 7.198132088058705, + "loss": 0.4866, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "grad_norm": 1.6607636213302612, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "learning_rate": 0.00015370514221364073, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.42872315645217896, + "step": 21580 + }, + { + "ce_loss": 0.10537848621606827, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.19183389842510223, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.1090899258852005, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.9066057205200195, + "step": 21580 + }, + { + "ce_loss": 0.07847505807876587, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.21104788780212402, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.08899524807929993, + "step": 21580 + }, + { + "epoch": 7.201467645096731, + "loss": 0.5644, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "grad_norm": 0.9659031629562378, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "learning_rate": 0.00015336478765024358, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.41692793369293213, + "step": 21590 + }, + { + "ce_loss": 0.09320396929979324, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.21223962306976318, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.08695603907108307, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.31187373399734497, + "step": 21590 + }, + { + "ce_loss": 0.06352602690458298, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.13763265311717987, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.1104658916592598, + "step": 21590 + }, + { + "epoch": 7.204803202134756, + "loss": 0.4597, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "grad_norm": 1.227384328842163, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "learning_rate": 0.00015302472093667828, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.3964717388153076, + "step": 21600 + }, + { + "ce_loss": 0.07394091784954071, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.18971940875053406, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.09033519774675369, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.41900038719177246, + "step": 21600 + }, + { + "ce_loss": 0.09434220939874649, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.1868446320295334, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.0968356803059578, + "step": 21600 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.4705, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "grad_norm": 0.9861631989479065, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "learning_rate": 0.0001526849424698394, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.4758778512477875, + "step": 21610 + }, + { + "ce_loss": 0.08692550659179688, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.236834317445755, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.1145910695195198, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.47692006826400757, + "step": 21610 + }, + { + "ce_loss": 0.07442447543144226, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.21716845035552979, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.09120145440101624, + "step": 21610 + }, + { + "epoch": 7.211474316210807, + "loss": 0.4786, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "grad_norm": 1.013535737991333, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "learning_rate": 0.00015234545264628476, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.47235748171806335, + "step": 21620 + }, + { + "ce_loss": 0.1114814430475235, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.22301355004310608, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.10892931371927261, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.6911946535110474, + "step": 21620 + }, + { + "ce_loss": 0.1308412253856659, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.22310936450958252, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.10248128324747086, + "step": 21620 + }, + { + "epoch": 7.214809873248832, + "loss": 0.4661, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "grad_norm": 1.8655632734298706, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "learning_rate": 0.00015200625186223565, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.5949530005455017, + "step": 21630 + }, + { + "ce_loss": 0.10551103949546814, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.2142055481672287, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.0733252689242363, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.35121750831604004, + "step": 21630 + }, + { + "ce_loss": 0.056218236684799194, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.2145787477493286, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.059078801423311234, + "step": 21630 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.4914, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "grad_norm": 1.7390278577804565, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "learning_rate": 0.00015166734051357577, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.5395673513412476, + "step": 21640 + }, + { + "ce_loss": 0.12450134754180908, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.2334718406200409, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.11392674595117569, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.48015227913856506, + "step": 21640 + }, + { + "ce_loss": 0.07428359240293503, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.26726609468460083, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.09658200293779373, + "step": 21640 + }, + { + "epoch": 7.221480987324883, + "loss": 0.5486, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "grad_norm": 1.0495847463607788, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "learning_rate": 0.00015132871899585138, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.49218320846557617, + "step": 21650 + }, + { + "ce_loss": 0.08046775311231613, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.2417549341917038, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.09568624198436737, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.5142796039581299, + "step": 21650 + }, + { + "ce_loss": 0.10060924291610718, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.2038334608078003, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.10336092859506607, + "step": 21650 + }, + { + "epoch": 7.224816544362908, + "loss": 0.5344, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "grad_norm": 1.4265145063400269, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "learning_rate": 0.00015099038770426994, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.5247296094894409, + "step": 21660 + }, + { + "ce_loss": 0.13266858458518982, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.2536264955997467, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.09633111953735352, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.3984815776348114, + "step": 21660 + }, + { + "ce_loss": 0.06782064586877823, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.21827705204486847, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.08778180181980133, + "step": 21660 + }, + { + "epoch": 7.228152101400934, + "loss": 0.5001, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "grad_norm": 2.303767204284668, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "learning_rate": 0.00015065234703370045, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 0.6092923283576965, + "step": 21670 + }, + { + "ce_loss": 0.1271263211965561, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.28391289710998535, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.1012510359287262, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 1.048234462738037, + "step": 21670 + }, + { + "ce_loss": 0.1372372806072235, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.2900506854057312, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.09884429723024368, + "step": 21670 + }, + { + "epoch": 7.231487658438959, + "loss": 0.5523, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "grad_norm": 1.5447536706924438, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "learning_rate": 0.000150314597378673, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.522541344165802, + "step": 21680 + }, + { + "ce_loss": 0.0930483341217041, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.2165500968694687, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.10621573030948639, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.5200650095939636, + "step": 21680 + }, + { + "ce_loss": 0.1290084421634674, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.26626408100128174, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.0937294289469719, + "step": 21680 + }, + { + "epoch": 7.234823215476984, + "loss": 0.5085, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "grad_norm": 1.0944291353225708, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "learning_rate": 0.00014997713913337784, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.4931850731372833, + "step": 21690 + }, + { + "ce_loss": 0.09169874340295792, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.2615334689617157, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.1397051066160202, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.5440504550933838, + "step": 21690 + }, + { + "ce_loss": 0.14239957928657532, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.2655315399169922, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.11141073703765869, + "step": 21690 + }, + { + "epoch": 7.23815877251501, + "loss": 0.5402, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "grad_norm": 1.2775064706802368, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "learning_rate": 0.00014963997269166473, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.5907175540924072, + "step": 21700 + }, + { + "ce_loss": 0.16186945140361786, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.2696630358695984, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.08794023096561432, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.6781678199768066, + "step": 21700 + }, + { + "ce_loss": 0.1132517084479332, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.2189943492412567, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.11259305477142334, + "step": 21700 + }, + { + "epoch": 7.241494329553035, + "loss": 0.4864, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "grad_norm": 1.0857279300689697, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "learning_rate": 0.00014930309844704334, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.3750140964984894, + "step": 21710 + }, + { + "ce_loss": 0.08555597811937332, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.18509650230407715, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.0733766034245491, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.3149873912334442, + "step": 21710 + }, + { + "ce_loss": 0.05044303089380264, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.16425633430480957, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.07339531928300858, + "step": 21710 + }, + { + "epoch": 7.24482988659106, + "loss": 0.4943, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "grad_norm": 1.3234370946884155, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "learning_rate": 0.00014896651679268219, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.4727824926376343, + "step": 21720 + }, + { + "ce_loss": 0.07473330199718475, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.2146746814250946, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.07707130163908005, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.42264512181282043, + "step": 21720 + }, + { + "ce_loss": 0.06445001810789108, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.18711066246032715, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.08896558731794357, + "step": 21720 + }, + { + "epoch": 7.248165443629086, + "loss": 0.5333, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "grad_norm": 1.3943631649017334, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "learning_rate": 0.0001486302281214084, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.4616813361644745, + "step": 21730 + }, + { + "ce_loss": 0.08090164512395859, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.26880764961242676, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.0775616466999054, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.6101661920547485, + "step": 21730 + }, + { + "ce_loss": 0.08751758933067322, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.22048398852348328, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.09813635051250458, + "step": 21730 + }, + { + "epoch": 7.251501000667111, + "loss": 0.5149, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "grad_norm": 1.2613996267318726, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "learning_rate": 0.0001482942328257067, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.4055621027946472, + "step": 21740 + }, + { + "ce_loss": 0.05802743881940842, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.1827191412448883, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.09838934242725372, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.7149930000305176, + "step": 21740 + }, + { + "ce_loss": 0.15911222994327545, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.24919471144676208, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.11354556679725647, + "step": 21740 + }, + { + "epoch": 7.254836557705136, + "loss": 0.483, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "grad_norm": 1.9538707733154297, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "learning_rate": 0.00014795853129772, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.46994128823280334, + "step": 21750 + }, + { + "ce_loss": 0.10398206859827042, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.21987509727478027, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.08994855731725693, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.5077059864997864, + "step": 21750 + }, + { + "ce_loss": 0.0923563688993454, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.23000693321228027, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.10818903893232346, + "step": 21750 + }, + { + "epoch": 7.258172114743162, + "loss": 0.4675, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "grad_norm": 1.8521201610565186, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "learning_rate": 0.00014762312392924804, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.6129436492919922, + "step": 21760 + }, + { + "ce_loss": 0.10266965627670288, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.22881817817687988, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.08024413883686066, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.5525320172309875, + "step": 21760 + }, + { + "ce_loss": 0.14949725568294525, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.2612256705760956, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.09284612536430359, + "step": 21760 + }, + { + "epoch": 7.261507671781187, + "loss": 0.4865, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "grad_norm": 1.28142511844635, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "learning_rate": 0.00014728801111174754, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.44657081365585327, + "step": 21770 + }, + { + "ce_loss": 0.06146930530667305, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.23916156589984894, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.10164698213338852, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.559298038482666, + "step": 21770 + }, + { + "ce_loss": 0.11093275249004364, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.28344765305519104, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.12337548285722733, + "step": 21770 + }, + { + "epoch": 7.264843228819212, + "loss": 0.4996, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "grad_norm": 1.1702762842178345, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "learning_rate": 0.00014695319323633065, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.5426932573318481, + "step": 21780 + }, + { + "ce_loss": 0.1180221438407898, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.20660299062728882, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.09778185933828354, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.3912394940853119, + "step": 21780 + }, + { + "ce_loss": 0.08908562362194061, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.21106182038784027, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.09095821529626846, + "step": 21780 + }, + { + "epoch": 7.268178785857238, + "loss": 0.4887, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "grad_norm": 1.193352460861206, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "learning_rate": 0.00014661867069376636, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.505956768989563, + "step": 21790 + }, + { + "ce_loss": 0.10640949010848999, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.24152949452400208, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.11776086688041687, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.516710638999939, + "step": 21790 + }, + { + "ce_loss": 0.1165366917848587, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.24122023582458496, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.10701066255569458, + "step": 21790 + }, + { + "epoch": 7.271514342895263, + "loss": 0.4608, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "grad_norm": 2.8318581581115723, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "learning_rate": 0.0001462844438744785, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.559289276599884, + "step": 21800 + }, + { + "ce_loss": 0.1380220353603363, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.21198466420173645, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.1316009908914566, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.329565167427063, + "step": 21800 + }, + { + "ce_loss": 0.04673033952713013, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.17576806247234344, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.07755520194768906, + "step": 21800 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.5114, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "grad_norm": 1.2743655443191528, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "learning_rate": 0.00014595051316854541, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.42324957251548767, + "step": 21810 + }, + { + "ce_loss": 0.09732640534639359, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.21347419917583466, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.08264093846082687, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.40716296434402466, + "step": 21810 + }, + { + "ce_loss": 0.09525058418512344, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.22439442574977875, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.08725903183221817, + "step": 21810 + }, + { + "epoch": 7.278185456971314, + "loss": 0.47, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "grad_norm": 1.347745656967163, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "learning_rate": 0.00014561687896570032, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.41976019740104675, + "step": 21820 + }, + { + "ce_loss": 0.10885760188102722, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.20018614828586578, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.11058416962623596, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.5937715768814087, + "step": 21820 + }, + { + "ce_loss": 0.07900398224592209, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.14981651306152344, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.09780242294073105, + "step": 21820 + }, + { + "epoch": 7.281521014009339, + "loss": 0.5263, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "grad_norm": 1.4404274225234985, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "learning_rate": 0.0001452835416553302, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.5217397212982178, + "step": 21830 + }, + { + "ce_loss": 0.131598562002182, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.2908930778503418, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.09916959702968597, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.5160905122756958, + "step": 21830 + }, + { + "ce_loss": 0.09195408225059509, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.23636744916439056, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.09996692091226578, + "step": 21830 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.4825, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "grad_norm": 1.2493855953216553, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "learning_rate": 0.00014495050162647565, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.38376322388648987, + "step": 21840 + }, + { + "ce_loss": 0.07317432761192322, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.21176281571388245, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.0986928790807724, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.5978377461433411, + "step": 21840 + }, + { + "ce_loss": 0.17858096957206726, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.27734702825546265, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.10186594724655151, + "step": 21840 + }, + { + "epoch": 7.28819212808539, + "loss": 0.5656, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "grad_norm": 1.2831820249557495, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "learning_rate": 0.00014461775926783026, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.5874098539352417, + "step": 21850 + }, + { + "ce_loss": 0.06480753421783447, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.22294045984745026, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.0869978591799736, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.36357828974723816, + "step": 21850 + }, + { + "ce_loss": 0.07771913707256317, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.19707082211971283, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.08844716846942902, + "step": 21850 + }, + { + "epoch": 7.291527685123415, + "loss": 0.4883, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "grad_norm": 1.7892661094665527, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "learning_rate": 0.00014428531496773995, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.44125548005104065, + "step": 21860 + }, + { + "ce_loss": 0.0870680883526802, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.1840522587299347, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.08497877418994904, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.4629088044166565, + "step": 21860 + }, + { + "ce_loss": 0.12965235114097595, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.21723756194114685, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.09699278324842453, + "step": 21860 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.4749, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "grad_norm": 1.915931224822998, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "learning_rate": 0.00014395316911420308, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.42724180221557617, + "step": 21870 + }, + { + "ce_loss": 0.09100129455327988, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.21688386797904968, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.09163907170295715, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.4978054463863373, + "step": 21870 + }, + { + "ce_loss": 0.09396034479141235, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.2810831069946289, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.092870332300663, + "step": 21870 + }, + { + "epoch": 7.298198799199466, + "loss": 0.5138, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "grad_norm": 1.2401480674743652, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "learning_rate": 0.00014362132209486968, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.473979651927948, + "step": 21880 + }, + { + "ce_loss": 0.15663237869739532, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.20220325887203217, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.09212686121463776, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.5248937010765076, + "step": 21880 + }, + { + "ce_loss": 0.09290619939565659, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.22812524437904358, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.10141429305076599, + "step": 21880 + }, + { + "epoch": 7.301534356237491, + "loss": 0.4731, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "grad_norm": 1.250961422920227, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "learning_rate": 0.00014328977429704085, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.4021410346031189, + "step": 21890 + }, + { + "ce_loss": 0.056410495191812515, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.23186610639095306, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.091607004404068, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.5981037020683289, + "step": 21890 + }, + { + "ce_loss": 0.08761950582265854, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.17893314361572266, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.09102575480937958, + "step": 21890 + }, + { + "epoch": 7.304869913275517, + "loss": 0.5045, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "grad_norm": 1.5320053100585938, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "learning_rate": 0.0001429585261076686, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.4536799192428589, + "step": 21900 + }, + { + "ce_loss": 0.08082486689090729, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.18056809902191162, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.10206877440214157, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.5019956827163696, + "step": 21900 + }, + { + "ce_loss": 0.11438484489917755, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.21802611649036407, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.1281689703464508, + "step": 21900 + }, + { + "epoch": 7.308205470313542, + "loss": 0.5188, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "grad_norm": 2.2819159030914307, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "learning_rate": 0.00014262757791335515, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.5231117010116577, + "step": 21910 + }, + { + "ce_loss": 0.12724162638187408, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.22880399227142334, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.0907425805926323, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.6519007086753845, + "step": 21910 + }, + { + "ce_loss": 0.0975770652294159, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.24211576581001282, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.08263137191534042, + "step": 21910 + }, + { + "epoch": 7.311541027351567, + "loss": 0.4677, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "grad_norm": 1.6301801204681396, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "learning_rate": 0.00014229693010035286, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.5612131357192993, + "step": 21920 + }, + { + "ce_loss": 0.08595165610313416, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.259377121925354, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.1124444529414177, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.6012833714485168, + "step": 21920 + }, + { + "ce_loss": 0.11187417060136795, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.18224146962165833, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.11612199246883392, + "step": 21920 + }, + { + "epoch": 7.314876584389593, + "loss": 0.5822, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "grad_norm": 1.509248971939087, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "learning_rate": 0.00014196658305456303, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 1.277998685836792, + "step": 21930 + }, + { + "ce_loss": 0.10564474016427994, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.23719510436058044, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.09459588676691055, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 0.37890195846557617, + "step": 21930 + }, + { + "ce_loss": 0.07021559029817581, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.17815831303596497, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.0899161696434021, + "step": 21930 + }, + { + "epoch": 7.318212141427618, + "loss": 0.5154, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "grad_norm": 1.338369607925415, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "learning_rate": 0.00014163653716153628, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.4284664988517761, + "step": 21940 + }, + { + "ce_loss": 0.09836511313915253, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.21835702657699585, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.11144930124282837, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.39852064847946167, + "step": 21940 + }, + { + "ce_loss": 0.0820104330778122, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.20707827806472778, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.07728994637727737, + "step": 21940 + }, + { + "epoch": 7.321547698465643, + "loss": 0.4578, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "grad_norm": 2.4161667823791504, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "learning_rate": 0.00014130679280647174, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.564189612865448, + "step": 21950 + }, + { + "ce_loss": 0.1051364466547966, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.20676258206367493, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.07331159710884094, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.4392548203468323, + "step": 21950 + }, + { + "ce_loss": 0.1069713830947876, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.2125968188047409, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.0849190428853035, + "step": 21950 + }, + { + "epoch": 7.324883255503669, + "loss": 0.5103, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "grad_norm": 1.7507930994033813, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "learning_rate": 0.00014097735037421668, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.5139173269271851, + "step": 21960 + }, + { + "ce_loss": 0.07396847009658813, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.26365751028060913, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.09375769644975662, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.5542353391647339, + "step": 21960 + }, + { + "ce_loss": 0.141982302069664, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.20985393226146698, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.1077088788151741, + "step": 21960 + }, + { + "epoch": 7.328218812541694, + "loss": 0.5147, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "grad_norm": 3.0060346126556396, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "learning_rate": 0.00014064821024926553, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.534508228302002, + "step": 21970 + }, + { + "ce_loss": 0.14316098392009735, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.25907862186431885, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.11218282580375671, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.5390993356704712, + "step": 21970 + }, + { + "ce_loss": 0.09973648935556412, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.22498083114624023, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.10661143809556961, + "step": 21970 + }, + { + "epoch": 7.331554369579719, + "loss": 0.5304, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "grad_norm": 1.9831620454788208, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "learning_rate": 0.0001403193728157605, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.5298435091972351, + "step": 21980 + }, + { + "ce_loss": 0.0789138600230217, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.25244104862213135, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.08395465463399887, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.5483510494232178, + "step": 21980 + }, + { + "ce_loss": 0.11609560996294022, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.2503870725631714, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.10731753706932068, + "step": 21980 + }, + { + "epoch": 7.334889926617745, + "loss": 0.4871, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "grad_norm": 1.3999574184417725, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "learning_rate": 0.00013999083845749012, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.43087154626846313, + "step": 21990 + }, + { + "ce_loss": 0.07881233841180801, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.18328407406806946, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.10450533032417297, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.3638628423213959, + "step": 21990 + }, + { + "ce_loss": 0.051684122532606125, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.16024218499660492, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.07850784063339233, + "step": 21990 + }, + { + "epoch": 7.33822548365577, + "loss": 0.4752, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "grad_norm": 2.5212299823760986, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "learning_rate": 0.00013966260755788947, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 1.059046983718872, + "step": 22000 + }, + { + "ce_loss": 0.12717647850513458, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.31576603651046753, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.08922392129898071, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 0.354206919670105, + "step": 22000 + }, + { + "ce_loss": 0.05530741438269615, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.13689683377742767, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.06636778265237808, + "step": 22000 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.5179, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "grad_norm": 1.3943521976470947, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "learning_rate": 0.00013933468050003923, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.43112093210220337, + "step": 22010 + }, + { + "ce_loss": 0.08019595593214035, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.2375338226556778, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.11313319206237793, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.6591454148292542, + "step": 22010 + }, + { + "ce_loss": 0.10027246177196503, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.2757124900817871, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.07839225232601166, + "step": 22010 + }, + { + "epoch": 7.344896597731822, + "loss": 0.5175, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "grad_norm": 1.3006362915039062, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "learning_rate": 0.0001390070576666656, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.40917250514030457, + "step": 22020 + }, + { + "ce_loss": 0.08095649629831314, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.22681719064712524, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.06790400296449661, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.5089716911315918, + "step": 22020 + }, + { + "ce_loss": 0.11554168909788132, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.2949783504009247, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.09493078291416168, + "step": 22020 + }, + { + "epoch": 7.348232154769846, + "loss": 0.4836, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "grad_norm": 1.0829044580459595, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "learning_rate": 0.00013867973944013966, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.6953809261322021, + "step": 22030 + }, + { + "ce_loss": 0.15069177746772766, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.2764919698238373, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.10267274081707001, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.47006797790527344, + "step": 22030 + }, + { + "ce_loss": 0.11325942724943161, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.20543146133422852, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.12529228627681732, + "step": 22030 + }, + { + "epoch": 7.351567711807872, + "loss": 0.5068, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "grad_norm": 1.3046250343322754, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "learning_rate": 0.00013835272620247717, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.3885897696018219, + "step": 22040 + }, + { + "ce_loss": 0.07588858157396317, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.2265595644712448, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.08590566366910934, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.41721630096435547, + "step": 22040 + }, + { + "ce_loss": 0.10608571022748947, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.21725106239318848, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.09329008311033249, + "step": 22040 + }, + { + "epoch": 7.354903268845897, + "loss": 0.5419, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "grad_norm": 1.8390084505081177, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "learning_rate": 0.00013802601833533745, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.40415939688682556, + "step": 22050 + }, + { + "ce_loss": 0.08732908219099045, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.20429277420043945, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.11229567974805832, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.35471150279045105, + "step": 22050 + }, + { + "ce_loss": 0.05761922150850296, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.16647978127002716, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.09119325131177902, + "step": 22050 + }, + { + "epoch": 7.358238825883923, + "loss": 0.5271, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "grad_norm": 1.500783920288086, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "learning_rate": 0.00013769961622002393, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.4502326548099518, + "step": 22060 + }, + { + "ce_loss": 0.09496644884347916, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.21306586265563965, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.08571892231702805, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.4776359796524048, + "step": 22060 + }, + { + "ce_loss": 0.1146068349480629, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.2597961127758026, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.07996554672718048, + "step": 22060 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.5446, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "grad_norm": 3.993879556655884, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "learning_rate": 0.000137373520237483, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.4738246500492096, + "step": 22070 + }, + { + "ce_loss": 0.08311747014522552, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.19675718247890472, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.10863441973924637, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.7610086798667908, + "step": 22070 + }, + { + "ce_loss": 0.13840071856975555, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.2668496072292328, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.09363804757595062, + "step": 22070 + }, + { + "epoch": 7.364909939959974, + "loss": 0.5033, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "grad_norm": 2.0557427406311035, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "learning_rate": 0.00013704773076830378, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 0.7582910656929016, + "step": 22080 + }, + { + "ce_loss": 0.1402321755886078, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.28201544284820557, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.0874849334359169, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 0.7125459313392639, + "step": 22080 + }, + { + "ce_loss": 0.10863231867551804, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.2794286906719208, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.08346754312515259, + "step": 22080 + }, + { + "epoch": 7.368245496997998, + "loss": 0.5462, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "grad_norm": 1.49296236038208, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "learning_rate": 0.0001367222481927175, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.4915519952774048, + "step": 22090 + }, + { + "ce_loss": 0.09840986132621765, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.20584022998809814, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.1020207479596138, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.40518438816070557, + "step": 22090 + }, + { + "ce_loss": 0.057568494230508804, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.21791799366474152, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.08963776379823685, + "step": 22090 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.4666, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "grad_norm": 1.6419566869735718, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "learning_rate": 0.0001363970728905975, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.652350127696991, + "step": 22100 + }, + { + "ce_loss": 0.08931543678045273, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.29681864380836487, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.09608305990695953, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.34036093950271606, + "step": 22100 + }, + { + "ce_loss": 0.04965618625283241, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.18944749236106873, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.08181827515363693, + "step": 22100 + }, + { + "epoch": 7.374916611074049, + "loss": 0.513, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "grad_norm": 1.180824637413025, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "learning_rate": 0.0001360722052414582, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.5612049102783203, + "step": 22110 + }, + { + "ce_loss": 0.1324518769979477, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.2755542993545532, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.12568174302577972, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.576231062412262, + "step": 22110 + }, + { + "ce_loss": 0.11681327223777771, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.21474020183086395, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.11753689497709274, + "step": 22110 + }, + { + "epoch": 7.378252168112075, + "loss": 0.5058, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "grad_norm": 1.7695060968399048, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "learning_rate": 0.0001357476456244552, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.39721181988716125, + "step": 22120 + }, + { + "ce_loss": 0.0658101886510849, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.22495456039905548, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.08024078607559204, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.5268717408180237, + "step": 22120 + }, + { + "ce_loss": 0.1184745505452156, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.25744789838790894, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.0897599533200264, + "step": 22120 + }, + { + "epoch": 7.3815877251501, + "loss": 0.4758, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "grad_norm": 1.3974146842956543, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "learning_rate": 0.00013542339441838453, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.4541488587856293, + "step": 22130 + }, + { + "ce_loss": 0.0709289163351059, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.2584846615791321, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.09032522141933441, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.7902992367744446, + "step": 22130 + }, + { + "ce_loss": 0.07666203379631042, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.26494210958480835, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.09109896421432495, + "step": 22130 + }, + { + "epoch": 7.384923282188126, + "loss": 0.4863, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "grad_norm": 1.9298646450042725, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "learning_rate": 0.00013509945200168217, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.3699907064437866, + "step": 22140 + }, + { + "ce_loss": 0.0853923037648201, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.175503671169281, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.10878586769104004, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.49869394302368164, + "step": 22140 + }, + { + "ce_loss": 0.10805847495794296, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.2761743664741516, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.09387902915477753, + "step": 22140 + }, + { + "epoch": 7.38825883922615, + "loss": 0.5507, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "grad_norm": 0.9105316400527954, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "learning_rate": 0.00013477581875242391, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.5569889545440674, + "step": 22150 + }, + { + "ce_loss": 0.09075947850942612, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.20240755379199982, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.09280704706907272, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.40518975257873535, + "step": 22150 + }, + { + "ce_loss": 0.06980345398187637, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.2161484658718109, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.0939449593424797, + "step": 22150 + }, + { + "epoch": 7.391594396264177, + "loss": 0.5112, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "grad_norm": 1.392278790473938, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "learning_rate": 0.00013445249504832435, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.4864904284477234, + "step": 22160 + }, + { + "ce_loss": 0.1145547479391098, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.1910143345594406, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.09960886090993881, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.4335716962814331, + "step": 22160 + }, + { + "ce_loss": 0.08965346217155457, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.1827930063009262, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.13129045069217682, + "step": 22160 + }, + { + "epoch": 7.394929953302201, + "loss": 0.521, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "grad_norm": 1.4772255420684814, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "learning_rate": 0.00013412948126673716, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.48208677768707275, + "step": 22170 + }, + { + "ce_loss": 0.09089218825101852, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.18222153186798096, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.08755753189325333, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.4618496000766754, + "step": 22170 + }, + { + "ce_loss": 0.08060234785079956, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.21322056651115417, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.09799622744321823, + "step": 22170 + }, + { + "epoch": 7.398265510340227, + "loss": 0.4817, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "grad_norm": 1.0283700227737427, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "learning_rate": 0.00013380677778465421, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.5956416130065918, + "step": 22180 + }, + { + "ce_loss": 0.1274511069059372, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.23653876781463623, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.12005682289600372, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.5424402952194214, + "step": 22180 + }, + { + "ce_loss": 0.09740731120109558, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.2374219298362732, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.12457883358001709, + "step": 22180 + }, + { + "epoch": 7.401601067378252, + "loss": 0.4967, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "grad_norm": 1.1883015632629395, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "learning_rate": 0.00013348438497870518, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.356072336435318, + "step": 22190 + }, + { + "ce_loss": 0.044157691299915314, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.2180037796497345, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.0936390832066536, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.47434988617897034, + "step": 22190 + }, + { + "ce_loss": 0.10346195101737976, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.23020198941230774, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.10282209515571594, + "step": 22190 + }, + { + "epoch": 7.404936624416278, + "loss": 0.479, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "grad_norm": 1.337052583694458, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "learning_rate": 0.0001331623032251572, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.38373318314552307, + "step": 22200 + }, + { + "ce_loss": 0.1033421978354454, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.17759321630001068, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.1023772805929184, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.5399565100669861, + "step": 22200 + }, + { + "ce_loss": 0.10520821809768677, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.20086219906806946, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.1283208280801773, + "step": 22200 + }, + { + "epoch": 7.408272181454302, + "loss": 0.5298, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "grad_norm": 1.4969794750213623, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "learning_rate": 0.00013284053289991423, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.4264291822910309, + "step": 22210 + }, + { + "ce_loss": 0.07203280925750732, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.21674410998821259, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.08955416083335876, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.46230363845825195, + "step": 22210 + }, + { + "ce_loss": 0.07309994846582413, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.2028307467699051, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.09434698522090912, + "step": 22210 + }, + { + "epoch": 7.411607738492329, + "loss": 0.4573, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "grad_norm": 1.280541181564331, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "learning_rate": 0.00013251907437851674, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.3875102698802948, + "step": 22220 + }, + { + "ce_loss": 0.08745839446783066, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.19196641445159912, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.10794602334499359, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.7894054651260376, + "step": 22220 + }, + { + "ce_loss": 0.07735282927751541, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.20096616446971893, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.08367619663476944, + "step": 22220 + }, + { + "epoch": 7.414943295530353, + "loss": 0.5061, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "grad_norm": 1.6839677095413208, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "learning_rate": 0.00013219792803614183, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.43914952874183655, + "step": 22230 + }, + { + "ce_loss": 0.11902012676000595, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.2054997980594635, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.060020480304956436, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.8430578708648682, + "step": 22230 + }, + { + "ce_loss": 0.17063359916210175, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.29544597864151, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.1225108951330185, + "step": 22230 + }, + { + "epoch": 7.418278852568379, + "loss": 0.5245, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "grad_norm": 1.309335470199585, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "learning_rate": 0.00013187709424760153, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.5299336910247803, + "step": 22240 + }, + { + "ce_loss": 0.0785478726029396, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.30060285329818726, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.11685214936733246, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.3738112449645996, + "step": 22240 + }, + { + "ce_loss": 0.07748304307460785, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.22111056745052338, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.07467032223939896, + "step": 22240 + }, + { + "epoch": 7.421614409606404, + "loss": 0.4995, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "grad_norm": 1.4998759031295776, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "learning_rate": 0.0001315565733873434, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.6699790358543396, + "step": 22250 + }, + { + "ce_loss": 0.05923834070563316, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.17364928126335144, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.07846441119909286, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.4596516788005829, + "step": 22250 + }, + { + "ce_loss": 0.0986219048500061, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.24059095978736877, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.09746402502059937, + "step": 22250 + }, + { + "epoch": 7.42494996664443, + "loss": 0.4921, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "grad_norm": 1.2402822971343994, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "learning_rate": 0.00013123636582944984, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.500768780708313, + "step": 22260 + }, + { + "ce_loss": 0.10541671514511108, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.22422003746032715, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.09545684605836868, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.47601836919784546, + "step": 22260 + }, + { + "ce_loss": 0.086974136531353, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.20608215034008026, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.09012197703123093, + "step": 22260 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.4806, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "grad_norm": 1.1672219038009644, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "learning_rate": 0.00013091647194763767, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.3636277914047241, + "step": 22270 + }, + { + "ce_loss": 0.05640361085534096, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.15501046180725098, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.103425532579422, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.4259013235569, + "step": 22270 + }, + { + "ce_loss": 0.07275062799453735, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.19085094332695007, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.0643162727355957, + "step": 22270 + }, + { + "epoch": 7.431621080720481, + "loss": 0.4486, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "grad_norm": 1.9622478485107422, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "learning_rate": 0.0001305968921152572, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.4420868456363678, + "step": 22280 + }, + { + "ce_loss": 0.13633877038955688, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.2052733451128006, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.08635461330413818, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.47934165596961975, + "step": 22280 + }, + { + "ce_loss": 0.08335137367248535, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.2924102246761322, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.10293826460838318, + "step": 22280 + }, + { + "epoch": 7.434956637758505, + "loss": 0.5194, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "grad_norm": 1.7894092798233032, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "learning_rate": 0.00013027762670529263, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.5704688429832458, + "step": 22290 + }, + { + "ce_loss": 0.10463081300258636, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.22579312324523926, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.06787216663360596, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.3507632911205292, + "step": 22290 + }, + { + "ce_loss": 0.08278298377990723, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.19265946745872498, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.07519949972629547, + "step": 22290 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.4832, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "grad_norm": 1.4802080392837524, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "learning_rate": 0.00012995867609036097, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.40042388439178467, + "step": 22300 + }, + { + "ce_loss": 0.09222708642482758, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.20333006978034973, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.06465158611536026, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.4406663775444031, + "step": 22300 + }, + { + "ce_loss": 0.08464495092630386, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.22818736732006073, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.09121893346309662, + "step": 22300 + }, + { + "epoch": 7.441627751834556, + "loss": 0.5065, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "grad_norm": 1.4099823236465454, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "learning_rate": 0.0001296400406427121, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.5115585327148438, + "step": 22310 + }, + { + "ce_loss": 0.10224932432174683, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.19686448574066162, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.10360269248485565, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.44680503010749817, + "step": 22310 + }, + { + "ce_loss": 0.08968012034893036, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.22415149211883545, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.09783569723367691, + "step": 22310 + }, + { + "epoch": 7.444963308872582, + "loss": 0.4535, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "grad_norm": 1.3058733940124512, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "learning_rate": 0.00012932172073422765, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.3406934142112732, + "step": 22320 + }, + { + "ce_loss": 0.07440505921840668, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.16044959425926208, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.06553249061107635, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.4342862069606781, + "step": 22320 + }, + { + "ce_loss": 0.09907007962465286, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.2205602079629898, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.08764271438121796, + "step": 22320 + }, + { + "epoch": 7.448298865910607, + "loss": 0.4604, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "grad_norm": 4.206222057342529, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "learning_rate": 0.00012900371673642112, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.5938883423805237, + "step": 22330 + }, + { + "ce_loss": 0.1305702179670334, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.3110573887825012, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.11513999104499817, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.4503992199897766, + "step": 22330 + }, + { + "ce_loss": 0.10052350908517838, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.18781507015228271, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.11676362156867981, + "step": 22330 + }, + { + "epoch": 7.451634422948633, + "loss": 0.5262, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "grad_norm": 3.2831857204437256, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "learning_rate": 0.00012868602902043783, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.3693366050720215, + "step": 22340 + }, + { + "ce_loss": 0.06226786598563194, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.18790769577026367, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.07951121777296066, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.3679620921611786, + "step": 22340 + }, + { + "ce_loss": 0.08266917616128922, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.19653236865997314, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.08857674896717072, + "step": 22340 + }, + { + "epoch": 7.454969979986657, + "loss": 0.4677, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "grad_norm": 1.5119948387145996, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "learning_rate": 0.00012836865795705314, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.43310967087745667, + "step": 22350 + }, + { + "ce_loss": 0.09349098801612854, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.2101289927959442, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.08622722327709198, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.4472948908805847, + "step": 22350 + }, + { + "ce_loss": 0.08247264474630356, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.20638509094715118, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.11921993643045425, + "step": 22350 + }, + { + "epoch": 7.458305537024684, + "loss": 0.4832, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "grad_norm": 1.893993854522705, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "learning_rate": 0.00012805160391667338, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.43647223711013794, + "step": 22360 + }, + { + "ce_loss": 0.10963376611471176, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.2051556259393692, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.09101174026727676, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.3410559594631195, + "step": 22360 + }, + { + "ce_loss": 0.06607978045940399, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.19104278087615967, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.08367554843425751, + "step": 22360 + }, + { + "epoch": 7.461641094062708, + "loss": 0.513, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "grad_norm": 1.6382286548614502, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "learning_rate": 0.00012773486726933467, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.6054667830467224, + "step": 22370 + }, + { + "ce_loss": 0.09302318841218948, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.22380737960338593, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.09679944068193436, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.5648404359817505, + "step": 22370 + }, + { + "ce_loss": 0.11022025346755981, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.2605384588241577, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.09326773136854172, + "step": 22370 + }, + { + "epoch": 7.464976651100734, + "loss": 0.5024, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "grad_norm": 1.2432814836502075, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "learning_rate": 0.00012741844838470284, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.5672869682312012, + "step": 22380 + }, + { + "ce_loss": 0.13024158775806427, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.2523437738418579, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.10419787466526031, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.3623162806034088, + "step": 22380 + }, + { + "ce_loss": 0.0839012935757637, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.15986678004264832, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.06418856233358383, + "step": 22380 + }, + { + "epoch": 7.468312208138759, + "loss": 0.4916, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "grad_norm": 1.126115322113037, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "learning_rate": 0.00012710234763207282, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.4593971073627472, + "step": 22390 + }, + { + "ce_loss": 0.07878857105970383, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.16794341802597046, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.09075488150119781, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.7162830829620361, + "step": 22390 + }, + { + "ce_loss": 0.11342699080705643, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.2433580458164215, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.10004798322916031, + "step": 22390 + }, + { + "epoch": 7.471647765176785, + "loss": 0.4843, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "grad_norm": 1.6590230464935303, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "learning_rate": 0.00012678656538036803, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.5215333700180054, + "step": 22400 + }, + { + "ce_loss": 0.11010608822107315, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.23093946278095245, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.09201352298259735, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.41143471002578735, + "step": 22400 + }, + { + "ce_loss": 0.05778210982680321, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.20460692048072815, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.10184299200773239, + "step": 22400 + }, + { + "epoch": 7.474983322214809, + "loss": 0.488, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "grad_norm": 1.2910398244857788, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "learning_rate": 0.0001264711019981404, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.3339017331600189, + "step": 22410 + }, + { + "ce_loss": 0.050458505749702454, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.17623841762542725, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.06894265860319138, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.7849916815757751, + "step": 22410 + }, + { + "ce_loss": 0.13765177130699158, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.2127397358417511, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.1041124016046524, + "step": 22410 + }, + { + "epoch": 7.478318879252836, + "loss": 0.4916, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "grad_norm": 1.5999616384506226, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "learning_rate": 0.00012615595785356963, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.29595646262168884, + "step": 22420 + }, + { + "ce_loss": 0.05818495899438858, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.15219548344612122, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.08528938889503479, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.3686578869819641, + "step": 22420 + }, + { + "ce_loss": 0.08551814407110214, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.17895744740962982, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.07805319130420685, + "step": 22420 + }, + { + "epoch": 7.48165443629086, + "loss": 0.4361, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "grad_norm": 1.2545150518417358, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "learning_rate": 0.00012584113331446303, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.3689488470554352, + "step": 22430 + }, + { + "ce_loss": 0.06294714659452438, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.16390585899353027, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.09834092855453491, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.4330061674118042, + "step": 22430 + }, + { + "ce_loss": 0.0886654183268547, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.2353941649198532, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.0741928294301033, + "step": 22430 + }, + { + "epoch": 7.484989993328886, + "loss": 0.4324, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "grad_norm": 1.5502991676330566, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "learning_rate": 0.00012552662874825432, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.7424986362457275, + "step": 22440 + }, + { + "ce_loss": 0.0946795865893364, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.20105278491973877, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.10257663577795029, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.4984404444694519, + "step": 22440 + }, + { + "ce_loss": 0.09149035066366196, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.28340229392051697, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.12336947023868561, + "step": 22440 + }, + { + "epoch": 7.488325550366911, + "loss": 0.4556, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "grad_norm": 2.0209858417510986, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "learning_rate": 0.00012521244452200455, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.47479161620140076, + "step": 22450 + }, + { + "ce_loss": 0.08662200719118118, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.2342468798160553, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.0724748894572258, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.5271512866020203, + "step": 22450 + }, + { + "ce_loss": 0.09744291752576828, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.23693248629570007, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.1411687731742859, + "step": 22450 + }, + { + "epoch": 7.491661107404937, + "loss": 0.4946, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "grad_norm": 1.4775241613388062, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "learning_rate": 0.0001248985810024005, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.58427894115448, + "step": 22460 + }, + { + "ce_loss": 0.09811204671859741, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.19793188571929932, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.09271074831485748, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.4697480797767639, + "step": 22460 + }, + { + "ce_loss": 0.05923609435558319, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.23396554589271545, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.10728515684604645, + "step": 22460 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.4636, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "grad_norm": 1.045827865600586, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "learning_rate": 0.00012458503855575446, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.5289336442947388, + "step": 22470 + }, + { + "ce_loss": 0.06923147290945053, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.16591979563236237, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.12149394303560257, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.5726809501647949, + "step": 22470 + }, + { + "ce_loss": 0.16617202758789062, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.29169028997421265, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.09331735968589783, + "step": 22470 + }, + { + "epoch": 7.498332221480988, + "loss": 0.4868, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "grad_norm": 1.160749912261963, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "learning_rate": 0.0001242718175480043, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.39735355973243713, + "step": 22480 + }, + { + "ce_loss": 0.0984911397099495, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.2119746208190918, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.06830558180809021, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.44706398248672485, + "step": 22480 + }, + { + "ce_loss": 0.10186761617660522, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.2096789926290512, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.09839505702257156, + "step": 22480 + }, + { + "epoch": 7.501667778519012, + "loss": 0.512, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "grad_norm": 1.7275261878967285, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "learning_rate": 0.0001239589183447126, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.5226195454597473, + "step": 22490 + }, + { + "ce_loss": 0.15770918130874634, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.25905734300613403, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.10564803332090378, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.4982665181159973, + "step": 22490 + }, + { + "ce_loss": 0.12002474814653397, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.2492271214723587, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.08589009195566177, + "step": 22490 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.5313, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "grad_norm": 1.6123205423355103, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "learning_rate": 0.00012364634131106664, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.33314186334609985, + "step": 22500 + }, + { + "ce_loss": 0.06740890443325043, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.16940796375274658, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.09602712094783783, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.4375060200691223, + "step": 22500 + }, + { + "ce_loss": 0.08420051634311676, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.17328374087810516, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.09019183367490768, + "step": 22500 + }, + { + "epoch": 7.508338892595063, + "loss": 0.5171, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "grad_norm": 0.948650598526001, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "learning_rate": 0.00012333408681187709, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.41246312856674194, + "step": 22510 + }, + { + "ce_loss": 0.09295927733182907, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.2076326459646225, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.07924635708332062, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.3275887370109558, + "step": 22510 + }, + { + "ce_loss": 0.04823889583349228, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.1737799495458603, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.07115225493907928, + "step": 22510 + }, + { + "epoch": 7.511674449633089, + "loss": 0.4746, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "grad_norm": 1.3547013998031616, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "learning_rate": 0.00012302215521157867, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.48953017592430115, + "step": 22520 + }, + { + "ce_loss": 0.1005236953496933, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.27121877670288086, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.09814045578241348, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.43054911494255066, + "step": 22520 + }, + { + "ce_loss": 0.08458945900201797, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.22104483842849731, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.08826296031475067, + "step": 22520 + }, + { + "epoch": 7.515010006671114, + "loss": 0.506, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "grad_norm": 2.74025821685791, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "learning_rate": 0.0001227105468742292, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.7157801389694214, + "step": 22530 + }, + { + "ce_loss": 0.11807220429182053, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.25563985109329224, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.1178126335144043, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.3893582820892334, + "step": 22530 + }, + { + "ce_loss": 0.08098858594894409, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.17783185839653015, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.10213252902030945, + "step": 22530 + }, + { + "epoch": 7.51834556370914, + "loss": 0.5392, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "grad_norm": 1.3929061889648438, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "learning_rate": 0.00012239926216350928, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.5459983348846436, + "step": 22540 + }, + { + "ce_loss": 0.11914133280515671, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.24583551287651062, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.10463111847639084, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.3390716016292572, + "step": 22540 + }, + { + "ce_loss": 0.056259576231241226, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.1948683112859726, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.08741886168718338, + "step": 22540 + }, + { + "epoch": 7.521681120747164, + "loss": 0.45, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "grad_norm": 1.3042694330215454, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "learning_rate": 0.00012208830144272117, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.34747618436813354, + "step": 22550 + }, + { + "ce_loss": 0.0798603743314743, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.1654832810163498, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.06433495879173279, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.39804723858833313, + "step": 22550 + }, + { + "ce_loss": 0.11774889379739761, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.19477730989456177, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.08538540452718735, + "step": 22550 + }, + { + "epoch": 7.525016677785191, + "loss": 0.4811, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "grad_norm": 1.6051409244537354, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "learning_rate": 0.00012177766507478998, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.4381016790866852, + "step": 22560 + }, + { + "ce_loss": 0.09013054519891739, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.21436019241809845, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.0902925506234169, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.4647827744483948, + "step": 22560 + }, + { + "ce_loss": 0.08238541334867477, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.24512982368469238, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.10200794041156769, + "step": 22560 + }, + { + "epoch": 7.528352234823215, + "loss": 0.4775, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "grad_norm": 1.4340500831604004, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "learning_rate": 0.00012146735342226158, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.6594744920730591, + "step": 22570 + }, + { + "ce_loss": 0.14710494875907898, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.23049712181091309, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.10443130135536194, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.5185823440551758, + "step": 22570 + }, + { + "ce_loss": 0.07006262242794037, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.23122458159923553, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.11255393922328949, + "step": 22570 + }, + { + "epoch": 7.531687791861241, + "loss": 0.5173, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "grad_norm": 1.1945619583129883, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "learning_rate": 0.00012115736684730326, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.58127361536026, + "step": 22580 + }, + { + "ce_loss": 0.06672295182943344, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.2515077292919159, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.08761141449213028, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.3402024507522583, + "step": 22580 + }, + { + "ce_loss": 0.05126715078949928, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.18758653104305267, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.06539114564657211, + "step": 22580 + }, + { + "epoch": 7.535023348899266, + "loss": 0.4659, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "grad_norm": 1.089540958404541, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "learning_rate": 0.00012084770571170234, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.5159248113632202, + "step": 22590 + }, + { + "ce_loss": 0.10363738983869553, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.2532828152179718, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.11867432296276093, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.3509080111980438, + "step": 22590 + }, + { + "ce_loss": 0.04871860146522522, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.17586302757263184, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.07077796012163162, + "step": 22590 + }, + { + "epoch": 7.538358905937292, + "loss": 0.4991, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "grad_norm": 1.2875605821609497, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "learning_rate": 0.00012053837037686694, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.4832446575164795, + "step": 22600 + }, + { + "ce_loss": 0.09857060760259628, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.2361626923084259, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.11466442048549652, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.5136521458625793, + "step": 22600 + }, + { + "ce_loss": 0.09627451747655869, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.2690471410751343, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.1010148823261261, + "step": 22600 + }, + { + "epoch": 7.541694462975316, + "loss": 0.4601, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "grad_norm": 1.381775140762329, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "learning_rate": 0.00012022936120382464, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.5695637464523315, + "step": 22610 + }, + { + "ce_loss": 0.09625787287950516, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.2347060889005661, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.08520832657814026, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.6165584921836853, + "step": 22610 + }, + { + "ce_loss": 0.13299477100372314, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.30004408955574036, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.09910961985588074, + "step": 22610 + }, + { + "epoch": 7.545030020013343, + "loss": 0.5293, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "grad_norm": 1.9388092756271362, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "learning_rate": 0.00011992067855322248, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.4749663472175598, + "step": 22620 + }, + { + "ce_loss": 0.09989020228385925, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.24128346145153046, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.09536205977201462, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.4751257598400116, + "step": 22620 + }, + { + "ce_loss": 0.09537438303232193, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.2695975601673126, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.10993235558271408, + "step": 22620 + }, + { + "epoch": 7.548365577051367, + "loss": 0.4868, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "grad_norm": 1.4406224489212036, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "learning_rate": 0.00011961232278532608, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.36910074949264526, + "step": 22630 + }, + { + "ce_loss": 0.07736709713935852, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.20966193079948425, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.0818030834197998, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.4263499975204468, + "step": 22630 + }, + { + "ce_loss": 0.07568740099668503, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.22069811820983887, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.07900112122297287, + "step": 22630 + }, + { + "epoch": 7.551701134089393, + "loss": 0.5055, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "grad_norm": 1.8832731246948242, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "learning_rate": 0.00011930429426001999, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.5240845084190369, + "step": 22640 + }, + { + "ce_loss": 0.12520739436149597, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.215688556432724, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.15145321190357208, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.37638574838638306, + "step": 22640 + }, + { + "ce_loss": 0.06634283065795898, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.21147775650024414, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.08255521953105927, + "step": 22640 + }, + { + "epoch": 7.555036691127418, + "loss": 0.5192, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "grad_norm": 1.0957404375076294, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "learning_rate": 0.00011899659333680659, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.3762626349925995, + "step": 22650 + }, + { + "ce_loss": 0.0665406882762909, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.20673134922981262, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.06498983502388, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.7602916955947876, + "step": 22650 + }, + { + "ce_loss": 0.11462393403053284, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.22196552157402039, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.10610775649547577, + "step": 22650 + }, + { + "epoch": 7.558372248165444, + "loss": 0.5195, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "grad_norm": 1.423340082168579, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "learning_rate": 0.00011868922037480601, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.41893377900123596, + "step": 22660 + }, + { + "ce_loss": 0.08694668114185333, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.2006167471408844, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.09499073773622513, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.5745624899864197, + "step": 22660 + }, + { + "ce_loss": 0.10483787953853607, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.22017048299312592, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.09431114047765732, + "step": 22660 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.4536, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "grad_norm": 1.5026270151138306, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "learning_rate": 0.0001183821757327555, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.6381749510765076, + "step": 22670 + }, + { + "ce_loss": 0.14865674078464508, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.3198041021823883, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.11032967269420624, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.44358402490615845, + "step": 22670 + }, + { + "ce_loss": 0.12820997834205627, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.21085762977600098, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.0794897973537445, + "step": 22670 + }, + { + "epoch": 7.565043362241495, + "loss": 0.5031, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "grad_norm": 1.253045916557312, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "learning_rate": 0.00011807545976900929, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.5187609195709229, + "step": 22680 + }, + { + "ce_loss": 0.13985517621040344, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.264445424079895, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.11417710781097412, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.5495779514312744, + "step": 22680 + }, + { + "ce_loss": 0.09258265048265457, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.26880165934562683, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.10621566325426102, + "step": 22680 + }, + { + "epoch": 7.568378919279519, + "loss": 0.4652, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "grad_norm": 1.1245099306106567, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "learning_rate": 0.00011776907284153793, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.4744243323802948, + "step": 22690 + }, + { + "ce_loss": 0.0841083750128746, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.19078223407268524, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.09597629308700562, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.4273073971271515, + "step": 22690 + }, + { + "ce_loss": 0.095107801258564, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.24718546867370605, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.08484120666980743, + "step": 22690 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.4627, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "grad_norm": 2.2187397480010986, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "learning_rate": 0.00011746301530792779, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.4897652268409729, + "step": 22700 + }, + { + "ce_loss": 0.11604621261358261, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.23150309920310974, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.11678127944469452, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.3536388874053955, + "step": 22700 + }, + { + "ce_loss": 0.0858709067106247, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.1904067099094391, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.07716457545757294, + "step": 22700 + }, + { + "epoch": 7.57505003335557, + "loss": 0.4752, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "grad_norm": 1.5376068353652954, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "learning_rate": 0.00011715728752538102, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.48741573095321655, + "step": 22710 + }, + { + "ce_loss": 0.09763926267623901, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.23203155398368835, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.09691423177719116, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.35795313119888306, + "step": 22710 + }, + { + "ce_loss": 0.07701003551483154, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.188336580991745, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.0710507407784462, + "step": 22710 + }, + { + "epoch": 7.578385590393596, + "loss": 0.4953, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "grad_norm": 1.9367997646331787, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "learning_rate": 0.00011685188985071485, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.6553285717964172, + "step": 22720 + }, + { + "ce_loss": 0.16441576182842255, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.23933960497379303, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.11505937576293945, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.3881013095378876, + "step": 22720 + }, + { + "ce_loss": 0.06341129541397095, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.2118532657623291, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.06373800337314606, + "step": 22720 + }, + { + "epoch": 7.581721147431621, + "loss": 0.4939, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "grad_norm": 1.2959163188934326, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "learning_rate": 0.0001165468226403612, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.5416849851608276, + "step": 22730 + }, + { + "ce_loss": 0.11458203941583633, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.2702295184135437, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.07892525941133499, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.35920917987823486, + "step": 22730 + }, + { + "ce_loss": 0.03701075538992882, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.16796259582042694, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.0796397402882576, + "step": 22730 + }, + { + "epoch": 7.585056704469647, + "loss": 0.4879, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "grad_norm": 1.5322014093399048, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "learning_rate": 0.0001162420862503665, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.45357197523117065, + "step": 22740 + }, + { + "ce_loss": 0.08511171489953995, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.2759219706058502, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.06644267588853836, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.49072521924972534, + "step": 22740 + }, + { + "ce_loss": 0.11758553236722946, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.2424834817647934, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.09420862793922424, + "step": 22740 + }, + { + "epoch": 7.588392261507671, + "loss": 0.4543, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "grad_norm": 1.0535856485366821, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "learning_rate": 0.00011593768103639062, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.5638757944107056, + "step": 22750 + }, + { + "ce_loss": 0.14501330256462097, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.2349630892276764, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.1257646083831787, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.3980574905872345, + "step": 22750 + }, + { + "ce_loss": 0.0741574838757515, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.22142386436462402, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.10224597156047821, + "step": 22750 + }, + { + "epoch": 7.591727818545698, + "loss": 0.4443, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "grad_norm": 1.2894861698150635, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "learning_rate": 0.00011563360735370733, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.4116136431694031, + "step": 22760 + }, + { + "ce_loss": 0.05981724336743355, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.21280869841575623, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.05637165158987045, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.40976908802986145, + "step": 22760 + }, + { + "ce_loss": 0.1144990473985672, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.20817773044109344, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.08667189627885818, + "step": 22760 + }, + { + "epoch": 7.595063375583722, + "loss": 0.4725, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "grad_norm": 1.313011646270752, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "learning_rate": 0.00011532986555720335, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.41087859869003296, + "step": 22770 + }, + { + "ce_loss": 0.10889429599046707, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.19500449299812317, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.08164864033460617, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.6618943214416504, + "step": 22770 + }, + { + "ce_loss": 0.11418968439102173, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.2610551714897156, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.11935710161924362, + "step": 22770 + }, + { + "epoch": 7.598398932621748, + "loss": 0.5124, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "grad_norm": 1.1959657669067383, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "learning_rate": 0.00011502645600137808, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.46471190452575684, + "step": 22780 + }, + { + "ce_loss": 0.07454466819763184, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.20361733436584473, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.09685507416725159, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.4594840109348297, + "step": 22780 + }, + { + "ce_loss": 0.09722603857517242, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.21554715931415558, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.12070967257022858, + "step": 22780 + }, + { + "epoch": 7.601734489659773, + "loss": 0.5519, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "grad_norm": 3.1931025981903076, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "learning_rate": 0.00011472337904034302, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.6544221639633179, + "step": 22790 + }, + { + "ce_loss": 0.1270163208246231, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.2164284735918045, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.08375734090805054, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.28890594840049744, + "step": 22790 + }, + { + "ce_loss": 0.03669268265366554, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.18156400322914124, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.04691380262374878, + "step": 22790 + }, + { + "epoch": 7.605070046697799, + "loss": 0.5062, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "grad_norm": 6.59785270690918, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "learning_rate": 0.00011442063502782167, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.7847855091094971, + "step": 22800 + }, + { + "ce_loss": 0.05502784252166748, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.17512984573841095, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.06240437552332878, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.33179566264152527, + "step": 22800 + }, + { + "ce_loss": 0.043317005038261414, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.14471793174743652, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.0970865786075592, + "step": 22800 + }, + { + "epoch": 7.608405603735823, + "loss": 0.4982, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "grad_norm": 1.2467219829559326, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "learning_rate": 0.00011411822431714902, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.4001379609107971, + "step": 22810 + }, + { + "ce_loss": 0.08985906839370728, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.1979110985994339, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.11203141510486603, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.4521886706352234, + "step": 22810 + }, + { + "ce_loss": 0.052460163831710815, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.18328292667865753, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.08354941755533218, + "step": 22810 + }, + { + "epoch": 7.61174116077385, + "loss": 0.5096, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "grad_norm": 1.4720590114593506, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "learning_rate": 0.00011381614726127057, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.5751574635505676, + "step": 22820 + }, + { + "ce_loss": 0.08937083929777145, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.23865702748298645, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.12281100451946259, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.8471077084541321, + "step": 22820 + }, + { + "ce_loss": 0.05814126506447792, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.1887391358613968, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.0871066004037857, + "step": 22820 + }, + { + "epoch": 7.615076717811874, + "loss": 0.5446, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "grad_norm": 1.2326645851135254, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "learning_rate": 0.00011351440421274296, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.8309054374694824, + "step": 22830 + }, + { + "ce_loss": 0.07843590527772903, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.2470468282699585, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.08887697756290436, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.4266068637371063, + "step": 22830 + }, + { + "ce_loss": 0.08766929805278778, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.19042043387889862, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.1035689115524292, + "step": 22830 + }, + { + "epoch": 7.6184122748499, + "loss": 0.5534, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "grad_norm": 4.262497901916504, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "learning_rate": 0.00011321299552373274, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.6729900240898132, + "step": 22840 + }, + { + "ce_loss": 0.14668934047222137, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.2730126678943634, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.09181664884090424, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.4466512203216553, + "step": 22840 + }, + { + "ce_loss": 0.09699872881174088, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.2412225306034088, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.10813838243484497, + "step": 22840 + }, + { + "epoch": 7.621747831887925, + "loss": 0.5271, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "grad_norm": 1.896168828010559, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "learning_rate": 0.00011291192154601642, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.45912012457847595, + "step": 22850 + }, + { + "ce_loss": 0.10149000585079193, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.20684868097305298, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.11774280667304993, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.3641864061355591, + "step": 22850 + }, + { + "ce_loss": 0.05166614428162575, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.1630660593509674, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.06195435672998428, + "step": 22850 + }, + { + "epoch": 7.625083388925951, + "loss": 0.4385, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "grad_norm": 1.0913865566253662, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "learning_rate": 0.00011261118263097952, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.4047800302505493, + "step": 22860 + }, + { + "ce_loss": 0.080494225025177, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.19724488258361816, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.10261885076761246, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.5060003399848938, + "step": 22860 + }, + { + "ce_loss": 0.12229954451322556, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.27040326595306396, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.08427592366933823, + "step": 22860 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.5, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "grad_norm": 1.7914592027664185, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "learning_rate": 0.00011231077912961678, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.3973727524280548, + "step": 22870 + }, + { + "ce_loss": 0.10274583846330643, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.1657116413116455, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.0870896428823471, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.7164053916931152, + "step": 22870 + }, + { + "ce_loss": 0.12490220367908478, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.2593473196029663, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.11110673099756241, + "step": 22870 + }, + { + "epoch": 7.631754503002002, + "loss": 0.4629, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "grad_norm": 1.3830268383026123, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "learning_rate": 0.00011201071139253132, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.42698851227760315, + "step": 22880 + }, + { + "ce_loss": 0.12712013721466064, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.18044383823871613, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.11906865984201431, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.36526960134506226, + "step": 22880 + }, + { + "ce_loss": 0.0703156515955925, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.17102131247520447, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.060693372040987015, + "step": 22880 + }, + { + "epoch": 7.635090060040026, + "loss": 0.4636, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "grad_norm": 1.891458511352539, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "learning_rate": 0.0001117109797699348, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.38846418261528015, + "step": 22890 + }, + { + "ce_loss": 0.09349704533815384, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.17836086452007294, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.08837179094552994, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.34821924567222595, + "step": 22890 + }, + { + "ce_loss": 0.07081709802150726, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.17022114992141724, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.08378297835588455, + "step": 22890 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.4368, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "grad_norm": 1.291176676750183, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "learning_rate": 0.000111411584611646, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.46497029066085815, + "step": 22900 + }, + { + "ce_loss": 0.08809613436460495, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.18776924908161163, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.07957534492015839, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.3915051221847534, + "step": 22900 + }, + { + "ce_loss": 0.07151425629854202, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.18844066560268402, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.0949799045920372, + "step": 22900 + }, + { + "epoch": 7.641761174116077, + "loss": 0.4634, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "grad_norm": 1.6681424379348755, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "learning_rate": 0.00011111252626709135, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.7183939814567566, + "step": 22910 + }, + { + "ce_loss": 0.1378229409456253, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.2599409818649292, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.10532394051551819, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.4198286831378937, + "step": 22910 + }, + { + "ce_loss": 0.1008639857172966, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.22020921111106873, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.09852571785449982, + "step": 22910 + }, + { + "epoch": 7.645096731154103, + "loss": 0.4998, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "grad_norm": 1.3718444108963013, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "learning_rate": 0.00011081380508530413, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.5433452129364014, + "step": 22920 + }, + { + "ce_loss": 0.0930216908454895, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.19634249806404114, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.09999752044677734, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.4787171185016632, + "step": 22920 + }, + { + "ce_loss": 0.12058284878730774, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.22844330966472626, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.10536279529333115, + "step": 22920 + }, + { + "epoch": 7.648432288192128, + "loss": 0.5512, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "grad_norm": 2.082670211791992, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "learning_rate": 0.00011051542141492422, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.5069497227668762, + "step": 22930 + }, + { + "ce_loss": 0.08902516216039658, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.16418182849884033, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.09682785719633102, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.5495210886001587, + "step": 22930 + }, + { + "ce_loss": 0.08924379199743271, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.14871746301651, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.07341916114091873, + "step": 22930 + }, + { + "epoch": 7.651767845230154, + "loss": 0.4669, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "grad_norm": 1.1963962316513062, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "learning_rate": 0.00011021737560419718, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.4135090708732605, + "step": 22940 + }, + { + "ce_loss": 0.05780329555273056, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.19066178798675537, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.08974800258874893, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.5159320831298828, + "step": 22940 + }, + { + "ce_loss": 0.07121335715055466, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.1589433252811432, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.07944811880588531, + "step": 22940 + }, + { + "epoch": 7.655103402268178, + "loss": 0.488, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "grad_norm": 1.3356716632843018, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "learning_rate": 0.00010991966800097473, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.9157035946846008, + "step": 22950 + }, + { + "ce_loss": 0.14918577671051025, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.3011043071746826, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.12013433128595352, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.5391525030136108, + "step": 22950 + }, + { + "ce_loss": 0.15401852130889893, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.27220389246940613, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.11245297640562057, + "step": 22950 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.5002, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "grad_norm": 2.074615478515625, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "learning_rate": 0.00010962229895271367, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.4456467628479004, + "step": 22960 + }, + { + "ce_loss": 0.07245982438325882, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.1963823288679123, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.08949127048254013, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.41796499490737915, + "step": 22960 + }, + { + "ce_loss": 0.06558441370725632, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.19469647109508514, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.08166325092315674, + "step": 22960 + }, + { + "epoch": 7.661774516344229, + "loss": 0.4619, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "grad_norm": 1.8865573406219482, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "learning_rate": 0.00010932526880647582, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.4492861032485962, + "step": 22970 + }, + { + "ce_loss": 0.0742424726486206, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.17198584973812103, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.07258053123950958, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.8455418348312378, + "step": 22970 + }, + { + "ce_loss": 0.14300350844860077, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.25651904940605164, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.0984906330704689, + "step": 22970 + }, + { + "epoch": 7.665110073382255, + "loss": 0.4998, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "grad_norm": 1.2946828603744507, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "learning_rate": 0.00010902857790892703, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.7474709749221802, + "step": 22980 + }, + { + "ce_loss": 0.1747969537973404, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.27910372614860535, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.11947666853666306, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.43477147817611694, + "step": 22980 + }, + { + "ce_loss": 0.10325319319963455, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.23486895859241486, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.09604207426309586, + "step": 22980 + }, + { + "epoch": 7.66844563042028, + "loss": 0.4873, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "grad_norm": 1.5909743309020996, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "learning_rate": 0.00010873222660633748, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.5011396408081055, + "step": 22990 + }, + { + "ce_loss": 0.1381172388792038, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.24536283314228058, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.11745288223028183, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.9448994398117065, + "step": 22990 + }, + { + "ce_loss": 0.09806367754936218, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.17582949995994568, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.08818396180868149, + "step": 22990 + }, + { + "epoch": 7.671781187458306, + "loss": 0.5106, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "grad_norm": 1.6053009033203125, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "learning_rate": 0.00010843621524458148, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.4494698643684387, + "step": 23000 + }, + { + "ce_loss": 0.10007108747959137, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.2064976692199707, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.10766438394784927, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.3754218518733978, + "step": 23000 + }, + { + "ce_loss": 0.07610825449228287, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.21002225577831268, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.08917049318552017, + "step": 23000 + }, + { + "epoch": 7.67511674449633, + "loss": 0.4974, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "grad_norm": 1.244104266166687, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "learning_rate": 0.0001081405441691357, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.43800780177116394, + "step": 23010 + }, + { + "ce_loss": 0.08042147010564804, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.18267808854579926, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.09389374405145645, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.5576515197753906, + "step": 23010 + }, + { + "ce_loss": 0.08384501934051514, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.21569280326366425, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.11242746561765671, + "step": 23010 + }, + { + "epoch": 7.678452301534357, + "loss": 0.5444, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "grad_norm": 1.104618787765503, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "learning_rate": 0.00010784521372508027, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.5867886543273926, + "step": 23020 + }, + { + "ce_loss": 0.11901320517063141, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.25423428416252136, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.12625397741794586, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.45270201563835144, + "step": 23020 + }, + { + "ce_loss": 0.09595979750156403, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.20388580858707428, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.09884056448936462, + "step": 23020 + }, + { + "epoch": 7.681787858572381, + "loss": 0.4848, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "grad_norm": 1.514387607574463, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "learning_rate": 0.00010755022425709755, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.5719349980354309, + "step": 23030 + }, + { + "ce_loss": 0.14947743713855743, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.2445412427186966, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.10241449624300003, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.8717924356460571, + "step": 23030 + }, + { + "ce_loss": 0.08390536159276962, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.23097392916679382, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.11411327123641968, + "step": 23030 + }, + { + "epoch": 7.685123415610407, + "loss": 0.5377, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "grad_norm": 1.4054793119430542, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "learning_rate": 0.00010725557610947214, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.6011195778846741, + "step": 23040 + }, + { + "ce_loss": 0.11097917705774307, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.2800613343715668, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.1031576469540596, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.4414733648300171, + "step": 23040 + }, + { + "ce_loss": 0.08354143053293228, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.18651583790779114, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.10314172506332397, + "step": 23040 + }, + { + "epoch": 7.688458972648432, + "loss": 0.494, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "grad_norm": 2.3914988040924072, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "learning_rate": 0.00010696126962608995, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.426511287689209, + "step": 23050 + }, + { + "ce_loss": 0.07741673290729523, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.24165627360343933, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.10722195357084274, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.6378778219223022, + "step": 23050 + }, + { + "ce_loss": 0.1301768571138382, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.2512298822402954, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.09250593930482864, + "step": 23050 + }, + { + "epoch": 7.691794529686458, + "loss": 0.5472, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "grad_norm": 2.4647703170776367, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "learning_rate": 0.00010666730515043832, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.4028063118457794, + "step": 23060 + }, + { + "ce_loss": 0.04906738921999931, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.17633600533008575, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.07943486422300339, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.32616400718688965, + "step": 23060 + }, + { + "ce_loss": 0.04622570052742958, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.16948921978473663, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.08727295696735382, + "step": 23060 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.4505, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "grad_norm": 1.6886954307556152, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "learning_rate": 0.00010637368302560551, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.4469156861305237, + "step": 23070 + }, + { + "ce_loss": 0.1259409338235855, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.20817193388938904, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.08971387892961502, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.605049192905426, + "step": 23070 + }, + { + "ce_loss": 0.08226354420185089, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.24582314491271973, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.10886520892381668, + "step": 23070 + }, + { + "epoch": 7.698465643762509, + "loss": 0.4939, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "grad_norm": 1.330239176750183, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "learning_rate": 0.00010608040359428008, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.47724735736846924, + "step": 23080 + }, + { + "ce_loss": 0.09845036268234253, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.22484542429447174, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.09992624819278717, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.3705708980560303, + "step": 23080 + }, + { + "ce_loss": 0.06413956731557846, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.20974738895893097, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.09660591930150986, + "step": 23080 + }, + { + "epoch": 7.701801200800533, + "loss": 0.4884, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "grad_norm": 1.2784122228622437, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "learning_rate": 0.00010578746719875087, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 0.3787055015563965, + "step": 23090 + }, + { + "ce_loss": 0.06965330243110657, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.20763544738292694, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.06809075176715851, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 1.2794193029403687, + "step": 23090 + }, + { + "ce_loss": 0.11845719069242477, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.2328185886144638, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.11717583239078522, + "step": 23090 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.5178, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "grad_norm": 1.4460859298706055, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "learning_rate": 0.00010549487418090578, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.30405837297439575, + "step": 23100 + }, + { + "ce_loss": 0.037146370857954025, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.19589708745479584, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.07080904394388199, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.8166431188583374, + "step": 23100 + }, + { + "ce_loss": 0.12483064830303192, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.2657544016838074, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.07705806940793991, + "step": 23100 + }, + { + "epoch": 7.708472314876584, + "loss": 0.5018, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "grad_norm": 1.9317492246627808, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "learning_rate": 0.0001052026248822327, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.579699695110321, + "step": 23110 + }, + { + "ce_loss": 0.06592089682817459, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.22586572170257568, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.11663997918367386, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.3650611639022827, + "step": 23110 + }, + { + "ce_loss": 0.049009211361408234, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.18839851021766663, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.08527813106775284, + "step": 23110 + }, + { + "epoch": 7.71180787191461, + "loss": 0.4888, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "grad_norm": 1.1595827341079712, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "learning_rate": 0.00010491071964381798, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.5201905965805054, + "step": 23120 + }, + { + "ce_loss": 0.17336910963058472, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.2682691514492035, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.07831382751464844, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.3259257972240448, + "step": 23120 + }, + { + "ce_loss": 0.03200289607048035, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.175248384475708, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.08027581870555878, + "step": 23120 + }, + { + "epoch": 7.715143428952635, + "loss": 0.4828, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "grad_norm": 1.531973958015442, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "learning_rate": 0.00010461915880634627, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.412576824426651, + "step": 23130 + }, + { + "ce_loss": 0.06026545166969299, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.21231496334075928, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.07956986874341965, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.41848087310791016, + "step": 23130 + }, + { + "ce_loss": 0.08960796147584915, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.22111055254936218, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.08399610966444016, + "step": 23130 + }, + { + "epoch": 7.718478985990661, + "loss": 0.5039, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "grad_norm": 1.5510739088058472, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "learning_rate": 0.00010432794271010049, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.40824779868125916, + "step": 23140 + }, + { + "ce_loss": 0.09309175610542297, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.19793115556240082, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.11696568131446838, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.4437388777732849, + "step": 23140 + }, + { + "ce_loss": 0.08572607487440109, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.2218407690525055, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.0954335629940033, + "step": 23140 + }, + { + "epoch": 7.721814543028685, + "loss": 0.481, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "grad_norm": 1.44486403465271, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "learning_rate": 0.00010403707169496124, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.4209980070590973, + "step": 23150 + }, + { + "ce_loss": 0.04794754460453987, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.20606538653373718, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.06347404420375824, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.46626389026641846, + "step": 23150 + }, + { + "ce_loss": 0.08045854419469833, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.21515415608882904, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.09117396175861359, + "step": 23150 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.5281, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "grad_norm": 1.3297337293624878, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "learning_rate": 0.00010374654610040636, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.4131792187690735, + "step": 23160 + }, + { + "ce_loss": 0.08460301160812378, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.16719715297222137, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.0832613930106163, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.5897133350372314, + "step": 23160 + }, + { + "ce_loss": 0.14093492925167084, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.2610013782978058, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.10729961097240448, + "step": 23160 + }, + { + "epoch": 7.728485657104736, + "loss": 0.5116, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "grad_norm": 1.3210251331329346, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "learning_rate": 0.00010345636626551022, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.641310453414917, + "step": 23170 + }, + { + "ce_loss": 0.06956622749567032, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.2333042323589325, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.1018952876329422, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.6080069541931152, + "step": 23170 + }, + { + "ce_loss": 0.15181094408035278, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.28187862038612366, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.09707515686750412, + "step": 23170 + }, + { + "epoch": 7.731821214142762, + "loss": 0.4553, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "grad_norm": 1.2260631322860718, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "learning_rate": 0.0001031665325289441, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.3943283259868622, + "step": 23180 + }, + { + "ce_loss": 0.08596097677946091, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.19753313064575195, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.08808321505784988, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.43322786688804626, + "step": 23180 + }, + { + "ce_loss": 0.12042468786239624, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.2224731743335724, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.069795623421669, + "step": 23180 + }, + { + "epoch": 7.735156771180787, + "loss": 0.4834, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "grad_norm": 2.1221816539764404, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "learning_rate": 0.00010287704522897512, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.5039787292480469, + "step": 23190 + }, + { + "ce_loss": 0.1576055884361267, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.24595049023628235, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.10024640709161758, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.7509039640426636, + "step": 23190 + }, + { + "ce_loss": 0.1318170428276062, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.2708563506603241, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.1185612827539444, + "step": 23190 + }, + { + "epoch": 7.738492328218813, + "loss": 0.4798, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "grad_norm": 1.3106443881988525, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "learning_rate": 0.00010258790470346622, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.5049909949302673, + "step": 23200 + }, + { + "ce_loss": 0.10278329998254776, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.2146678864955902, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.11462140828371048, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.6290625333786011, + "step": 23200 + }, + { + "ce_loss": 0.12055748701095581, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.19564548134803772, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.10627972334623337, + "step": 23200 + }, + { + "epoch": 7.741827885256837, + "loss": 0.4806, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "grad_norm": 1.6218887567520142, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "learning_rate": 0.00010229911128987515, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.5480747222900391, + "step": 23210 + }, + { + "ce_loss": 0.11701656132936478, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.2518675923347473, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.11369401961565018, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.39638179540634155, + "step": 23210 + }, + { + "ce_loss": 0.05360805243253708, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.15531179308891296, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.06214721128344536, + "step": 23210 + }, + { + "epoch": 7.745163442294864, + "loss": 0.4309, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "grad_norm": 1.4390681982040405, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "learning_rate": 0.00010201066532525528, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.3549351096153259, + "step": 23220 + }, + { + "ce_loss": 0.09753035753965378, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.18527477979660034, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.07191254943609238, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.3918623626232147, + "step": 23220 + }, + { + "ce_loss": 0.07541152834892273, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.1978979855775833, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.08795817196369171, + "step": 23220 + }, + { + "epoch": 7.748498999332888, + "loss": 0.4934, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "grad_norm": 1.57815420627594, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "learning_rate": 0.00010172256714625406, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.5031512379646301, + "step": 23230 + }, + { + "ce_loss": 0.10324139893054962, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.26005861163139343, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.08189309388399124, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.44763678312301636, + "step": 23230 + }, + { + "ce_loss": 0.10603315383195877, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.202081561088562, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.11808335036039352, + "step": 23230 + }, + { + "epoch": 7.751834556370914, + "loss": 0.5099, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "grad_norm": 0.9400928616523743, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "learning_rate": 0.00010143481708911285, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.2977118492126465, + "step": 23240 + }, + { + "ce_loss": 0.09156695008277893, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.12004655599594116, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.08571289479732513, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.7364984750747681, + "step": 23240 + }, + { + "ce_loss": 0.056263577193021774, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.226426362991333, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.0868932455778122, + "step": 23240 + }, + { + "epoch": 7.755170113408939, + "loss": 0.4672, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "grad_norm": 1.5790131092071533, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "learning_rate": 0.00010114741548966704, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.36995837092399597, + "step": 23250 + }, + { + "ce_loss": 0.0722150132060051, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.16372142732143402, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.08817657828330994, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.4690679907798767, + "step": 23250 + }, + { + "ce_loss": 0.08446140587329865, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.2306831032037735, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.07352418452501297, + "step": 23250 + }, + { + "epoch": 7.758505670446965, + "loss": 0.4484, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "grad_norm": 1.297808051109314, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "learning_rate": 0.00010086036268334522, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.477030485868454, + "step": 23260 + }, + { + "ce_loss": 0.098115473985672, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.23980093002319336, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.1157793179154396, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.4922773540019989, + "step": 23260 + }, + { + "ce_loss": 0.050856947898864746, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.2195744812488556, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.1344345360994339, + "step": 23260 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.4674, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "grad_norm": 1.9501700401306152, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "learning_rate": 0.0001005736590051689, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.2705991268157959, + "step": 23270 + }, + { + "ce_loss": 0.036175936460494995, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.16692174971103668, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.06731446087360382, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.5502997636795044, + "step": 23270 + }, + { + "ce_loss": 0.11611030250787735, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.19314511120319366, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.12072230875492096, + "step": 23270 + }, + { + "epoch": 7.765176784523016, + "loss": 0.5065, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "grad_norm": 2.136212110519409, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "learning_rate": 0.00010028730478975226, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.47727152705192566, + "step": 23280 + }, + { + "ce_loss": 0.11116799712181091, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.22672784328460693, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.09270457178354263, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.3988770842552185, + "step": 23280 + }, + { + "ce_loss": 0.09395449608564377, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.17199018597602844, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.09608203917741776, + "step": 23280 + }, + { + "epoch": 7.76851234156104, + "loss": 0.4953, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "grad_norm": 1.2677170038223267, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "learning_rate": 0.00010000130037130122, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.33433595299720764, + "step": 23290 + }, + { + "ce_loss": 0.072944276034832, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.17916573584079742, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.08193469047546387, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.3454050123691559, + "step": 23290 + }, + { + "ce_loss": 0.07415476441383362, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.1711643487215042, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.07147339731454849, + "step": 23290 + }, + { + "epoch": 7.771847898599066, + "loss": 0.494, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "grad_norm": 2.2848458290100098, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "learning_rate": 9.971564608361386e-05, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.7371745109558105, + "step": 23300 + }, + { + "ce_loss": 0.07996401935815811, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.23706546425819397, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.08906930685043335, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.6135530471801758, + "step": 23300 + }, + { + "ce_loss": 0.11487612128257751, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.2451004981994629, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.09790825098752975, + "step": 23300 + }, + { + "epoch": 7.775183455637091, + "loss": 0.4982, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "grad_norm": 1.1426531076431274, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "learning_rate": 9.943034226007944e-05, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.5762659311294556, + "step": 23310 + }, + { + "ce_loss": 0.07822701334953308, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.21036364138126373, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.09202571213245392, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.46803978085517883, + "step": 23310 + }, + { + "ce_loss": 0.08661315590143204, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.26759105920791626, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.08875215798616409, + "step": 23310 + }, + { + "epoch": 7.778519012675117, + "loss": 0.5067, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "grad_norm": 1.3050298690795898, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "learning_rate": 9.914538923367822e-05, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.31625789403915405, + "step": 23320 + }, + { + "ce_loss": 0.061495281755924225, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.16869650781154633, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.07117399573326111, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.4371589720249176, + "step": 23320 + }, + { + "ce_loss": 0.09910117834806442, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.20337212085723877, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.09639453142881393, + "step": 23320 + }, + { + "epoch": 7.781854569713142, + "loss": 0.4238, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "grad_norm": 1.3703293800354004, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "learning_rate": 9.886078733698108e-05, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.6084938645362854, + "step": 23330 + }, + { + "ce_loss": 0.09252669662237167, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.2027970552444458, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.0995938628911972, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.3872583508491516, + "step": 23330 + }, + { + "ce_loss": 0.10837969928979874, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.19923055171966553, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.07942364364862442, + "step": 23330 + }, + { + "epoch": 7.785190126751168, + "loss": 0.4826, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "grad_norm": 1.4044020175933838, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "learning_rate": 9.857653690214905e-05, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.6372712850570679, + "step": 23340 + }, + { + "ce_loss": 0.1294550895690918, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.289159893989563, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.10100501775741577, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.5620340704917908, + "step": 23340 + }, + { + "ce_loss": 0.15031994879245758, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.26740288734436035, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.11114541441202164, + "step": 23340 + }, + { + "epoch": 7.788525683789192, + "loss": 0.4902, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "grad_norm": 1.316171646118164, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "learning_rate": 9.829263826093305e-05, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.3973608911037445, + "step": 23350 + }, + { + "ce_loss": 0.055136047303676605, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.2086581289768219, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.06788954138755798, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.3197157084941864, + "step": 23350 + }, + { + "ce_loss": 0.0911700427532196, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.13513973355293274, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.07355178147554398, + "step": 23350 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.467, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "grad_norm": 1.2956640720367432, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "learning_rate": 9.800909174467317e-05, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.5860046148300171, + "step": 23360 + }, + { + "ce_loss": 0.08855269849300385, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.20145806670188904, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.07742108404636383, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.500344455242157, + "step": 23360 + }, + { + "ce_loss": 0.06911452859640121, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.24287210404872894, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.07664454728364944, + "step": 23360 + }, + { + "epoch": 7.795196797865243, + "loss": 0.4986, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "grad_norm": 2.773852586746216, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "learning_rate": 9.772589768429874e-05, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.49430906772613525, + "step": 23370 + }, + { + "ce_loss": 0.13668407499790192, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.249668151140213, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.08518452197313309, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.3225928843021393, + "step": 23370 + }, + { + "ce_loss": 0.06387481838464737, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.15631364285945892, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.07911847531795502, + "step": 23370 + }, + { + "epoch": 7.798532354903269, + "loss": 0.4476, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "grad_norm": 3.633497714996338, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "learning_rate": 9.744305641032778e-05, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.5017527937889099, + "step": 23380 + }, + { + "ce_loss": 0.10943292081356049, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.26812535524368286, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.0913664773106575, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.3241008222103119, + "step": 23380 + }, + { + "ce_loss": 0.0440908744931221, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.20425589382648468, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.05590224638581276, + "step": 23380 + }, + { + "epoch": 7.801867911941294, + "loss": 0.4976, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "grad_norm": 1.370785117149353, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "learning_rate": 9.71605682528666e-05, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.47232586145401, + "step": 23390 + }, + { + "ce_loss": 0.0808471217751503, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.26032713055610657, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.09770216792821884, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.5162336230278015, + "step": 23390 + }, + { + "ce_loss": 0.11250855028629303, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.22452497482299805, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.10824142396450043, + "step": 23390 + }, + { + "epoch": 7.80520346897932, + "loss": 0.5396, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "grad_norm": 1.8400545120239258, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "learning_rate": 9.687843354160904e-05, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.515917956829071, + "step": 23400 + }, + { + "ce_loss": 0.10233543813228607, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.19212068617343903, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.109577476978302, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.4446874260902405, + "step": 23400 + }, + { + "ce_loss": 0.07502096146345139, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.2301982194185257, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.08803768455982208, + "step": 23400 + }, + { + "epoch": 7.808539026017344, + "loss": 0.5094, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "grad_norm": 1.3168748617172241, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "learning_rate": 9.65966526058367e-05, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.366154283285141, + "step": 23410 + }, + { + "ce_loss": 0.06043058633804321, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.21017275750637054, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.06821354478597641, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.38487592339515686, + "step": 23410 + }, + { + "ce_loss": 0.08029428869485855, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.1981252282857895, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.07559670507907867, + "step": 23410 + }, + { + "epoch": 7.811874583055371, + "loss": 0.4743, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "grad_norm": 1.4600363969802856, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "learning_rate": 9.631522577441838e-05, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.6461279988288879, + "step": 23420 + }, + { + "ce_loss": 0.12234245985746384, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.25451117753982544, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.14721155166625977, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.4057418406009674, + "step": 23420 + }, + { + "ce_loss": 0.09284445643424988, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.217638298869133, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.0950545147061348, + "step": 23420 + }, + { + "epoch": 7.815210140093396, + "loss": 0.4539, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "grad_norm": 1.27256441116333, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "learning_rate": 9.603415337580939e-05, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.45491790771484375, + "step": 23430 + }, + { + "ce_loss": 0.08659575879573822, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.20614565908908844, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.09821710735559464, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.5650123357772827, + "step": 23430 + }, + { + "ce_loss": 0.08840961009263992, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.2519112825393677, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.09993168711662292, + "step": 23430 + }, + { + "epoch": 7.818545697131421, + "loss": 0.4891, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "grad_norm": 2.7551286220550537, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "learning_rate": 9.575343573805149e-05, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.5477972030639648, + "step": 23440 + }, + { + "ce_loss": 0.08709275722503662, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.22108495235443115, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.0938587412238121, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.28603890538215637, + "step": 23440 + }, + { + "ce_loss": 0.04387114942073822, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.16789186000823975, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.07395945489406586, + "step": 23440 + }, + { + "epoch": 7.821881254169447, + "loss": 0.4996, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "grad_norm": 2.3392765522003174, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "learning_rate": 9.547307318877234e-05, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.5413024425506592, + "step": 23450 + }, + { + "ce_loss": 0.11601690202951431, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.2168048620223999, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.11610511690378189, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.422008752822876, + "step": 23450 + }, + { + "ce_loss": 0.08798611164093018, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.23578450083732605, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.0685184970498085, + "step": 23450 + }, + { + "epoch": 7.825216811207472, + "loss": 0.4755, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "grad_norm": 2.4963676929473877, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "learning_rate": 9.519306605518527e-05, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.7461239695549011, + "step": 23460 + }, + { + "ce_loss": 0.10150115191936493, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.17962796986103058, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.10328700393438339, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.3461375832557678, + "step": 23460 + }, + { + "ce_loss": 0.07732126116752625, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.18031379580497742, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.07350736856460571, + "step": 23460 + }, + { + "epoch": 7.828552368245497, + "loss": 0.5119, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "grad_norm": 1.6425929069519043, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "learning_rate": 9.491341466408882e-05, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.44518062472343445, + "step": 23470 + }, + { + "ce_loss": 0.11213728785514832, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.21456876397132874, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.11830023676156998, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.4567699432373047, + "step": 23470 + }, + { + "ce_loss": 0.0971532016992569, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.22243893146514893, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.10531125962734222, + "step": 23470 + }, + { + "epoch": 7.831887925283523, + "loss": 0.4603, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "grad_norm": 1.9173786640167236, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "learning_rate": 9.463411934186601e-05, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.5281015038490295, + "step": 23480 + }, + { + "ce_loss": 0.09305088967084885, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.27902358770370483, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.07683122158050537, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.44092512130737305, + "step": 23480 + }, + { + "ce_loss": 0.05991402268409729, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.261231005191803, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.08075682073831558, + "step": 23480 + }, + { + "epoch": 7.835223482321548, + "loss": 0.4746, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "grad_norm": 1.47471284866333, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "learning_rate": 9.435518041448466e-05, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.3915292024612427, + "step": 23490 + }, + { + "ce_loss": 0.0853789672255516, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.16837775707244873, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.09448844194412231, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.43984946608543396, + "step": 23490 + }, + { + "ce_loss": 0.08536282926797867, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.207362562417984, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.11695067584514618, + "step": 23490 + }, + { + "epoch": 7.838559039359573, + "loss": 0.4805, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "grad_norm": 1.0447158813476562, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "learning_rate": 9.407659820749648e-05, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.5188337564468384, + "step": 23500 + }, + { + "ce_loss": 0.06215955317020416, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.18601679801940918, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.08605736494064331, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.4018697142601013, + "step": 23500 + }, + { + "ce_loss": 0.09234922379255295, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.201063871383667, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.07423745840787888, + "step": 23500 + }, + { + "epoch": 7.841894596397599, + "loss": 0.447, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "grad_norm": 1.3518065214157104, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "learning_rate": 9.379837304603705e-05, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.3417120575904846, + "step": 23510 + }, + { + "ce_loss": 0.07121224701404572, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.1719876229763031, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.062478888779878616, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.45312342047691345, + "step": 23510 + }, + { + "ce_loss": 0.11944121867418289, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.23101180791854858, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.1022668406367302, + "step": 23510 + }, + { + "epoch": 7.845230153435624, + "loss": 0.4267, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "grad_norm": 1.7903634309768677, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "learning_rate": 9.352050525482478e-05, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.3670975863933563, + "step": 23520 + }, + { + "ce_loss": 0.08908902108669281, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.17360833287239075, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.08187222480773926, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.42096009850502014, + "step": 23520 + }, + { + "ce_loss": 0.08967997133731842, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.20975546538829803, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.09015170484781265, + "step": 23520 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.4412, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "grad_norm": 1.3786771297454834, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "learning_rate": 9.324299515816148e-05, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.4102479815483093, + "step": 23530 + }, + { + "ce_loss": 0.046972282230854034, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.22596120834350586, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.09007242321968079, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.6155588030815125, + "step": 23530 + }, + { + "ce_loss": 0.0814739242196083, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.2271156758069992, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.09563101828098297, + "step": 23530 + }, + { + "epoch": 7.851901267511675, + "loss": 0.5099, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "grad_norm": 1.020455241203308, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "learning_rate": 9.296584307993125e-05, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.6030544638633728, + "step": 23540 + }, + { + "ce_loss": 0.1653348058462143, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.2651827335357666, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.11329934746026993, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.6119688153266907, + "step": 23540 + }, + { + "ce_loss": 0.13465023040771484, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.22328448295593262, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.1069074496626854, + "step": 23540 + }, + { + "epoch": 7.8552368245497, + "loss": 0.5109, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "grad_norm": 1.1971163749694824, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "learning_rate": 9.268904934360039e-05, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.4943358600139618, + "step": 23550 + }, + { + "ce_loss": 0.14528106153011322, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.21824562549591064, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.0931699350476265, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.43538621068000793, + "step": 23550 + }, + { + "ce_loss": 0.07891083508729935, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.23238444328308105, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.0957845076918602, + "step": 23550 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.4925, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "grad_norm": 1.2838505506515503, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "learning_rate": 9.241261427221695e-05, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.37081849575042725, + "step": 23560 + }, + { + "ce_loss": 0.06106231361627579, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.17535613477230072, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.07697635889053345, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.43605366349220276, + "step": 23560 + }, + { + "ce_loss": 0.08120652288198471, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.20109125971794128, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.11010712385177612, + "step": 23560 + }, + { + "epoch": 7.861907938625751, + "loss": 0.4525, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "grad_norm": 2.322266101837158, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "learning_rate": 9.213653818841046e-05, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.6685158014297485, + "step": 23570 + }, + { + "ce_loss": 0.11310289800167084, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.24341696500778198, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.1057773232460022, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.48814332485198975, + "step": 23570 + }, + { + "ce_loss": 0.08805195987224579, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.2128458321094513, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.09854722023010254, + "step": 23570 + }, + { + "epoch": 7.865243495663776, + "loss": 0.5274, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "grad_norm": 2.334955930709839, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "learning_rate": 9.186082141439145e-05, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.43076473474502563, + "step": 23580 + }, + { + "ce_loss": 0.10165365785360336, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.2332915961742401, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.09554749727249146, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.41907650232315063, + "step": 23580 + }, + { + "ce_loss": 0.0959557369351387, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.23277854919433594, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.08993334323167801, + "step": 23580 + }, + { + "epoch": 7.868579052701802, + "loss": 0.4974, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "grad_norm": 1.3074328899383545, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "learning_rate": 9.158546427195092e-05, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.5405129194259644, + "step": 23590 + }, + { + "ce_loss": 0.10693372040987015, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.21220289170742035, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.11354639381170273, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.3445146083831787, + "step": 23590 + }, + { + "ce_loss": 0.09031592309474945, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.16584084928035736, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.08803331106901169, + "step": 23590 + }, + { + "epoch": 7.871914609739827, + "loss": 0.4623, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "grad_norm": 1.153918743133545, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "learning_rate": 9.131046708246036e-05, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.3527657985687256, + "step": 23600 + }, + { + "ce_loss": 0.07177247107028961, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.19940011203289032, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.08104899525642395, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.37144240736961365, + "step": 23600 + }, + { + "ce_loss": 0.08155050873756409, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.17217794060707092, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.08980002254247665, + "step": 23600 + }, + { + "epoch": 7.875250166777852, + "loss": 0.5094, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "grad_norm": 2.631946086883545, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "learning_rate": 9.103583016687105e-05, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.4280019700527191, + "step": 23610 + }, + { + "ce_loss": 0.07770437747240067, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.22484853863716125, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.059737708419561386, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.37178662419319153, + "step": 23610 + }, + { + "ce_loss": 0.0942569151520729, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.1954062432050705, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.08182733505964279, + "step": 23610 + }, + { + "epoch": 7.878585723815878, + "loss": 0.4368, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "grad_norm": 0.9139115810394287, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "learning_rate": 9.076155384571387e-05, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.6207575798034668, + "step": 23620 + }, + { + "ce_loss": 0.09600795805454254, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.264886736869812, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.09426525235176086, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.4685150384902954, + "step": 23620 + }, + { + "ce_loss": 0.12142255157232285, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.2318248301744461, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.08457332104444504, + "step": 23620 + }, + { + "epoch": 7.881921280853903, + "loss": 0.5001, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "grad_norm": 1.480294942855835, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "learning_rate": 9.048763843909891e-05, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.5095387101173401, + "step": 23630 + }, + { + "ce_loss": 0.11818449944257736, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.24988248944282532, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.09514731168746948, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.3793356418609619, + "step": 23630 + }, + { + "ce_loss": 0.07153532654047012, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.19732148945331573, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.06991773098707199, + "step": 23630 + }, + { + "epoch": 7.885256837891928, + "loss": 0.5244, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "grad_norm": 2.3088581562042236, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "learning_rate": 9.021408426671469e-05, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.46456190943717957, + "step": 23640 + }, + { + "ce_loss": 0.08094074577093124, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.2792062759399414, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.06169167906045914, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.5832566618919373, + "step": 23640 + }, + { + "ce_loss": 0.1753084659576416, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.2629208266735077, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.1448318362236023, + "step": 23640 + }, + { + "epoch": 7.888592394929954, + "loss": 0.5023, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "grad_norm": 1.5563539266586304, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "learning_rate": 8.994089164782838e-05, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.37901008129119873, + "step": 23650 + }, + { + "ce_loss": 0.06203204765915871, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.17749962210655212, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.10638806968927383, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.39251482486724854, + "step": 23650 + }, + { + "ce_loss": 0.06868699938058853, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.1591005176305771, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.08333493769168854, + "step": 23650 + }, + { + "epoch": 7.891927951967979, + "loss": 0.4642, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "grad_norm": 1.2899342775344849, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "learning_rate": 8.966806090128543e-05, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.3885990083217621, + "step": 23660 + }, + { + "ce_loss": 0.06434261053800583, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.21467149257659912, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.07244598120450974, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.5642814636230469, + "step": 23660 + }, + { + "ce_loss": 0.11021937429904938, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.22649425268173218, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.10326132923364639, + "step": 23660 + }, + { + "epoch": 7.895263509006004, + "loss": 0.438, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "grad_norm": 1.264271855354309, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "learning_rate": 8.939559234550845e-05, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.43615591526031494, + "step": 23670 + }, + { + "ce_loss": 0.07874233275651932, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.1742517352104187, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.09160982072353363, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.5795973539352417, + "step": 23670 + }, + { + "ce_loss": 0.10307463258504868, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.2566601037979126, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.08958743512630463, + "step": 23670 + }, + { + "epoch": 7.89859906604403, + "loss": 0.4488, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "grad_norm": 1.1816338300704956, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "learning_rate": 8.912348629849759e-05, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.44713523983955383, + "step": 23680 + }, + { + "ce_loss": 0.04218914732336998, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.1715305745601654, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.07068831473588943, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.3945850133895874, + "step": 23680 + }, + { + "ce_loss": 0.05503353103995323, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.20405761897563934, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.09490940719842911, + "step": 23680 + }, + { + "epoch": 7.901934623082055, + "loss": 0.4506, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "grad_norm": 1.0902290344238281, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "learning_rate": 8.88517430778299e-05, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.4979480504989624, + "step": 23690 + }, + { + "ce_loss": 0.07646001875400543, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.20225203037261963, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.102251335978508, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.4533572793006897, + "step": 23690 + }, + { + "ce_loss": 0.080350361764431, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.1974259465932846, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.08958312124013901, + "step": 23690 + }, + { + "epoch": 7.90527018012008, + "loss": 0.4877, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "grad_norm": 9.525444984436035, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "learning_rate": 8.858036300065911e-05, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.37766262888908386, + "step": 23700 + }, + { + "ce_loss": 0.08095361292362213, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.15454107522964478, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.10152625292539597, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.38847488164901733, + "step": 23700 + }, + { + "ce_loss": 0.08247093111276627, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.1659182459115982, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.108720563352108, + "step": 23700 + }, + { + "epoch": 7.908605737158106, + "loss": 0.4464, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "grad_norm": 1.6320754289627075, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "learning_rate": 8.830934638371476e-05, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.40211835503578186, + "step": 23710 + }, + { + "ce_loss": 0.10799682885408401, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.19310733675956726, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.10020504891872406, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.32844552397727966, + "step": 23710 + }, + { + "ce_loss": 0.0812181755900383, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.1739908903837204, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.07298498600721359, + "step": 23710 + }, + { + "epoch": 7.911941294196131, + "loss": 0.459, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "grad_norm": 1.5958127975463867, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "learning_rate": 8.80386935433024e-05, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.4221686124801636, + "step": 23720 + }, + { + "ce_loss": 0.08290351927280426, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.1929956078529358, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.11231111735105515, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.6971574425697327, + "step": 23720 + }, + { + "ce_loss": 0.14547531306743622, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.2727227210998535, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.09017284214496613, + "step": 23720 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.4447, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "grad_norm": 1.317276120185852, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "learning_rate": 8.776840479530317e-05, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.5677560567855835, + "step": 23730 + }, + { + "ce_loss": 0.14442239701747894, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.22917179763317108, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.10563664138317108, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.34828436374664307, + "step": 23730 + }, + { + "ce_loss": 0.053382810205221176, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.19687311351299286, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.06002996861934662, + "step": 23730 + }, + { + "epoch": 7.918612408272182, + "loss": 0.4476, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "grad_norm": 3.7101621627807617, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "learning_rate": 8.749848045517315e-05, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.3004143238067627, + "step": 23740 + }, + { + "ce_loss": 0.04413343220949173, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.14866234362125397, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.1073392927646637, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.581344723701477, + "step": 23740 + }, + { + "ce_loss": 0.16669215261936188, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.2569911479949951, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.08630262315273285, + "step": 23740 + }, + { + "epoch": 7.921947965310207, + "loss": 0.4464, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "grad_norm": 1.4706305265426636, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "learning_rate": 8.722892083794287e-05, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.5540456771850586, + "step": 23750 + }, + { + "ce_loss": 0.12081640213727951, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.2431809902191162, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.11178777366876602, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.5178357362747192, + "step": 23750 + }, + { + "ce_loss": 0.08638735115528107, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.21622644364833832, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.08753465861082077, + "step": 23750 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.4746, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "grad_norm": 1.4268579483032227, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "learning_rate": 8.695972625821744e-05, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.5005850791931152, + "step": 23760 + }, + { + "ce_loss": 0.11384665220975876, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.20936432480812073, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.08197855949401855, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.4651836156845093, + "step": 23760 + }, + { + "ce_loss": 0.05032093822956085, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.24174442887306213, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.08633485436439514, + "step": 23760 + }, + { + "epoch": 7.928619079386258, + "loss": 0.4899, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "grad_norm": 1.0803759098052979, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "learning_rate": 8.669089703017608e-05, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.3728218078613281, + "step": 23770 + }, + { + "ce_loss": 0.07326167076826096, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.19154828786849976, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.0846903920173645, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.38450494408607483, + "step": 23770 + }, + { + "ce_loss": 0.10573725402355194, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.18517494201660156, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.09331288933753967, + "step": 23770 + }, + { + "epoch": 7.931954636424283, + "loss": 0.476, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "grad_norm": 1.3623288869857788, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "learning_rate": 8.642243346757144e-05, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.33736154437065125, + "step": 23780 + }, + { + "ce_loss": 0.0692714974284172, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.17541882395744324, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.07102624326944351, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.43745487928390503, + "step": 23780 + }, + { + "ce_loss": 0.08982948958873749, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.22856579720973969, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.08436769992113113, + "step": 23780 + }, + { + "epoch": 7.935290193462309, + "loss": 0.4873, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "grad_norm": 0.9369721412658691, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "learning_rate": 8.615433588372921e-05, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.4392630457878113, + "step": 23790 + }, + { + "ce_loss": 0.07594869285821915, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.22134481370449066, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.11620956659317017, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.35754644870758057, + "step": 23790 + }, + { + "ce_loss": 0.055565547198057175, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.18978551030158997, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.07662378996610641, + "step": 23790 + }, + { + "epoch": 7.938625750500334, + "loss": 0.4711, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "grad_norm": 1.3382545709609985, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "learning_rate": 8.588660459154821e-05, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.6800578236579895, + "step": 23800 + }, + { + "ce_loss": 0.11877725273370743, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.25581854581832886, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.1141149178147316, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.4789075255393982, + "step": 23800 + }, + { + "ce_loss": 0.06832636147737503, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.2092619389295578, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.070604108273983, + "step": 23800 + }, + { + "epoch": 7.941961307538359, + "loss": 0.4898, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "grad_norm": 1.1001218557357788, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "learning_rate": 8.561923990349962e-05, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.34215047955513, + "step": 23810 + }, + { + "ce_loss": 0.06366624683141708, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.13738380372524261, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.06837518513202667, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.4595257043838501, + "step": 23810 + }, + { + "ce_loss": 0.12964025139808655, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.1967119425535202, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.10415495932102203, + "step": 23810 + }, + { + "epoch": 7.945296864576385, + "loss": 0.4967, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "grad_norm": 1.7074393033981323, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "learning_rate": 8.535224213162694e-05, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.7736154198646545, + "step": 23820 + }, + { + "ce_loss": 0.12521244585514069, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.2687603831291199, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.0855402946472168, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.32968664169311523, + "step": 23820 + }, + { + "ce_loss": 0.047487348318099976, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.19186483323574066, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.09002339094877243, + "step": 23820 + }, + { + "epoch": 7.94863242161441, + "loss": 0.524, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "grad_norm": 1.674178957939148, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "learning_rate": 8.508561158754508e-05, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.5029487609863281, + "step": 23830 + }, + { + "ce_loss": 0.10470442473888397, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.3090134561061859, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.08878245949745178, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.31632712483406067, + "step": 23830 + }, + { + "ce_loss": 0.07198981195688248, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.1398410201072693, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.05910256877541542, + "step": 23830 + }, + { + "epoch": 7.951967978652435, + "loss": 0.4389, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "grad_norm": 1.578067421913147, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "learning_rate": 8.481934858244072e-05, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.3850739002227783, + "step": 23840 + }, + { + "ce_loss": 0.09572809189558029, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.2023659646511078, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.06878726184368134, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.33684679865837097, + "step": 23840 + }, + { + "ce_loss": 0.05011402815580368, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.17439530789852142, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.08841081708669662, + "step": 23840 + }, + { + "epoch": 7.955303535690461, + "loss": 0.4765, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "grad_norm": 1.7793841361999512, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "learning_rate": 8.455345342707138e-05, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.46160686016082764, + "step": 23850 + }, + { + "ce_loss": 0.07929213345050812, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.22592313587665558, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.10625654458999634, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.41478806734085083, + "step": 23850 + }, + { + "ce_loss": 0.0628419816493988, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.2424730658531189, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.0736921951174736, + "step": 23850 + }, + { + "epoch": 7.958639092728486, + "loss": 0.4448, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "grad_norm": 1.1721047163009644, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "learning_rate": 8.428792643176544e-05, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.31136319041252136, + "step": 23860 + }, + { + "ce_loss": 0.05320083349943161, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.16525880992412567, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.0927494540810585, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.40695562958717346, + "step": 23860 + }, + { + "ce_loss": 0.058694299310445786, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.23138663172721863, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.09014415740966797, + "step": 23860 + }, + { + "epoch": 7.961974649766511, + "loss": 0.4914, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "grad_norm": 1.4769777059555054, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "learning_rate": 8.402276790642117e-05, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.560581624507904, + "step": 23870 + }, + { + "ce_loss": 0.07228103280067444, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.21539956331253052, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.1249655932188034, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.44131043553352356, + "step": 23870 + }, + { + "ce_loss": 0.08430244773626328, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.21277786791324615, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.08507349342107773, + "step": 23870 + }, + { + "epoch": 7.965310206804537, + "loss": 0.4711, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "grad_norm": 1.1926960945129395, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "learning_rate": 8.375797816050743e-05, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.762226939201355, + "step": 23880 + }, + { + "ce_loss": 0.11749958992004395, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.2878839373588562, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.13919147849082947, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.4957149922847748, + "step": 23880 + }, + { + "ce_loss": 0.09586787223815918, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.2319086343050003, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.1141003966331482, + "step": 23880 + }, + { + "epoch": 7.968645763842562, + "loss": 0.4884, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "grad_norm": 2.1842336654663086, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "learning_rate": 8.349355750306233e-05, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.4850006401538849, + "step": 23890 + }, + { + "ce_loss": 0.1268412321805954, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.24083614349365234, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.0900849848985672, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.41669759154319763, + "step": 23890 + }, + { + "ce_loss": 0.07890409231185913, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.20932383835315704, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.10311402380466461, + "step": 23890 + }, + { + "epoch": 7.971981320880587, + "loss": 0.4596, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "grad_norm": 1.0587044954299927, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "learning_rate": 8.322950624269301e-05, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.4549918472766876, + "step": 23900 + }, + { + "ce_loss": 0.08084800839424133, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.18602995574474335, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.09495816379785538, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.43853050470352173, + "step": 23900 + }, + { + "ce_loss": 0.08710458874702454, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.22391125559806824, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.10002166777849197, + "step": 23900 + }, + { + "epoch": 7.975316877918613, + "loss": 0.4565, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "grad_norm": 0.9432169198989868, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "learning_rate": 8.296582468757583e-05, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.22017309069633484, + "step": 23910 + }, + { + "ce_loss": 0.0369831807911396, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.10918110609054565, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.0737534835934639, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.4532097280025482, + "step": 23910 + }, + { + "ce_loss": 0.10423537343740463, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.18550725281238556, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.08551128953695297, + "step": 23910 + }, + { + "epoch": 7.978652434956638, + "loss": 0.4653, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "grad_norm": 1.5507874488830566, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "learning_rate": 8.270251314545557e-05, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.4448733329772949, + "step": 23920 + }, + { + "ce_loss": 0.07775755971670151, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.18209832906723022, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.08332362771034241, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.3152199387550354, + "step": 23920 + }, + { + "ce_loss": 0.049137767404317856, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.16559302806854248, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.06261259317398071, + "step": 23920 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.5149, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "grad_norm": 1.3906818628311157, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "learning_rate": 8.243957192364514e-05, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.4795082211494446, + "step": 23930 + }, + { + "ce_loss": 0.07854557782411575, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.19227755069732666, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.12034870684146881, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.3461790978908539, + "step": 23930 + }, + { + "ce_loss": 0.08637548983097076, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.1667971909046173, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.09276289492845535, + "step": 23930 + }, + { + "epoch": 7.985323549032689, + "loss": 0.5295, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "grad_norm": 1.211516261100769, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "learning_rate": 8.21770013290251e-05, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.41299039125442505, + "step": 23940 + }, + { + "ce_loss": 0.10126766562461853, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.2065594643354416, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.06785832345485687, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.572719395160675, + "step": 23940 + }, + { + "ce_loss": 0.07103711366653442, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.19799089431762695, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.07989655435085297, + "step": 23940 + }, + { + "epoch": 7.988659106070714, + "loss": 0.4702, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "grad_norm": 1.3315993547439575, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "learning_rate": 8.191480166804368e-05, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.639316976070404, + "step": 23950 + }, + { + "ce_loss": 0.14909887313842773, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.2654898762702942, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.1123674139380455, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.44099435210227966, + "step": 23950 + }, + { + "ce_loss": 0.09333381056785583, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.198538139462471, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.10574786365032196, + "step": 23950 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.5087, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "grad_norm": 1.3975638151168823, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "learning_rate": 8.165297324671608e-05, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.37281206250190735, + "step": 23960 + }, + { + "ce_loss": 0.0822332426905632, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.16164526343345642, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.08455748856067657, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.5337703227996826, + "step": 23960 + }, + { + "ce_loss": 0.11471060663461685, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.2362850159406662, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.08897367119789124, + "step": 23960 + }, + { + "epoch": 7.995330220146765, + "loss": 0.4487, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "grad_norm": 1.1574140787124634, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "learning_rate": 8.13915163706243e-05, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.45330312848091125, + "step": 23970 + }, + { + "ce_loss": 0.08726263791322708, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.2119605392217636, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.11371081322431564, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.501648485660553, + "step": 23970 + }, + { + "ce_loss": 0.14696897566318512, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.21774685382843018, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.11590823531150818, + "step": 23970 + }, + { + "epoch": 7.99866577718479, + "loss": 0.5028, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "grad_norm": 1.1483594179153442, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "learning_rate": 8.113043134491656e-05, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.8916773200035095, + "step": 23980 + }, + { + "ce_loss": 0.08570877462625504, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.2253543585538864, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.10541833192110062, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.3779972195625305, + "step": 23980 + }, + { + "ce_loss": 0.0662795677781105, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.21109434962272644, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.06937018781900406, + "step": 23980 + }, + { + "epoch": 8.002001334222815, + "loss": 0.4566, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "grad_norm": 1.309220552444458, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "learning_rate": 8.086971847430728e-05, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "loss": 0.5047003626823425, + "step": 23990 + }, + { + "ce_loss": 0.09218957275152206, + "epoch": 8.002001334222815, + "step": 23990 + }, + { + "distill_loss": 0.2730647325515747, + "epoch": 8.002001334222815, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "ref_ce_loss": 0.07127413153648376, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "loss": 0.4207586944103241, + "step": 23990 + }, + { + "ce_loss": 0.08874835073947906, + "epoch": 8.002001334222815, + "step": 23990 + }, + { + "distill_loss": 0.22054798901081085, + "epoch": 8.002001334222815, + "step": 23990 + }, + { + "epoch": 8.002001334222815, + "ref_ce_loss": 0.07755181193351746, + "step": 23990 + }, + { + "epoch": 8.005336891260841, + "loss": 0.4223, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "grad_norm": 1.0361367464065552, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "learning_rate": 8.060937806307633e-05, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "loss": 0.3582214117050171, + "step": 24000 + }, + { + "ce_loss": 0.059342242777347565, + "epoch": 8.005336891260841, + "step": 24000 + }, + { + "distill_loss": 0.2024124413728714, + "epoch": 8.005336891260841, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "ref_ce_loss": 0.05704175680875778, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "loss": 0.4356797933578491, + "step": 24000 + }, + { + "ce_loss": 0.08584046363830566, + "epoch": 8.005336891260841, + "step": 24000 + }, + { + "distill_loss": 0.19962406158447266, + "epoch": 8.005336891260841, + "step": 24000 + }, + { + "epoch": 8.005336891260841, + "ref_ce_loss": 0.08645645529031754, + "step": 24000 + }, + { + "epoch": 8.008672448298865, + "loss": 0.4352, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "grad_norm": 1.4843693971633911, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "learning_rate": 8.034941041506918e-05, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "loss": 0.4060533940792084, + "step": 24010 + }, + { + "ce_loss": 0.07756105810403824, + "epoch": 8.008672448298865, + "step": 24010 + }, + { + "distill_loss": 0.20981661975383759, + "epoch": 8.008672448298865, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "ref_ce_loss": 0.0870758667588234, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "loss": 0.9769420027732849, + "step": 24010 + }, + { + "ce_loss": 0.052198830991983414, + "epoch": 8.008672448298865, + "step": 24010 + }, + { + "distill_loss": 0.25216665863990784, + "epoch": 8.008672448298865, + "step": 24010 + }, + { + "epoch": 8.008672448298865, + "ref_ce_loss": 0.06700126826763153, + "step": 24010 + }, + { + "epoch": 8.012008005336892, + "loss": 0.4767, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "grad_norm": 1.925126314163208, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "learning_rate": 8.008981583369575e-05, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "loss": 0.3252241313457489, + "step": 24020 + }, + { + "ce_loss": 0.06078958138823509, + "epoch": 8.012008005336892, + "step": 24020 + }, + { + "distill_loss": 0.1793786734342575, + "epoch": 8.012008005336892, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "ref_ce_loss": 0.08483623713254929, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "loss": 0.5393207669258118, + "step": 24020 + }, + { + "ce_loss": 0.05587855726480484, + "epoch": 8.012008005336892, + "step": 24020 + }, + { + "distill_loss": 0.22161366045475006, + "epoch": 8.012008005336892, + "step": 24020 + }, + { + "epoch": 8.012008005336892, + "ref_ce_loss": 0.07822628319263458, + "step": 24020 + }, + { + "epoch": 8.015343562374916, + "loss": 0.4434, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "grad_norm": 1.1432065963745117, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "learning_rate": 7.983059462193105e-05, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "loss": 0.8035485744476318, + "step": 24030 + }, + { + "ce_loss": 0.0552186444401741, + "epoch": 8.015343562374916, + "step": 24030 + }, + { + "distill_loss": 0.22818496823310852, + "epoch": 8.015343562374916, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "ref_ce_loss": 0.08648039400577545, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "loss": 0.5100343823432922, + "step": 24030 + }, + { + "ce_loss": 0.05726398900151253, + "epoch": 8.015343562374916, + "step": 24030 + }, + { + "distill_loss": 0.2203618735074997, + "epoch": 8.015343562374916, + "step": 24030 + }, + { + "epoch": 8.015343562374916, + "ref_ce_loss": 0.07337582111358643, + "step": 24030 + }, + { + "epoch": 8.018679119412942, + "loss": 0.4535, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "grad_norm": 1.6379269361495972, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "learning_rate": 7.957174708231404e-05, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "loss": 0.5619211196899414, + "step": 24040 + }, + { + "ce_loss": 0.06277598440647125, + "epoch": 8.018679119412942, + "step": 24040 + }, + { + "distill_loss": 0.21573035418987274, + "epoch": 8.018679119412942, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "ref_ce_loss": 0.0782727375626564, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "loss": 0.3220251202583313, + "step": 24040 + }, + { + "ce_loss": 0.04112187772989273, + "epoch": 8.018679119412942, + "step": 24040 + }, + { + "distill_loss": 0.18421748280525208, + "epoch": 8.018679119412942, + "step": 24040 + }, + { + "epoch": 8.018679119412942, + "ref_ce_loss": 0.0625697672367096, + "step": 24040 + }, + { + "epoch": 8.022014676450967, + "loss": 0.3997, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "grad_norm": 1.0753532648086548, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "learning_rate": 7.931327351694781e-05, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "loss": 0.4898815453052521, + "step": 24050 + }, + { + "ce_loss": 0.06757506728172302, + "epoch": 8.022014676450967, + "step": 24050 + }, + { + "distill_loss": 0.24555155634880066, + "epoch": 8.022014676450967, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "ref_ce_loss": 0.05489666759967804, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "loss": 0.3805431127548218, + "step": 24050 + }, + { + "ce_loss": 0.042916931211948395, + "epoch": 8.022014676450967, + "step": 24050 + }, + { + "distill_loss": 0.18931376934051514, + "epoch": 8.022014676450967, + "step": 24050 + }, + { + "epoch": 8.022014676450967, + "ref_ce_loss": 0.060156628489494324, + "step": 24050 + }, + { + "epoch": 8.025350233488993, + "loss": 0.397, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "grad_norm": 1.094015121459961, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "learning_rate": 7.905517422749862e-05, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "loss": 0.31888458132743835, + "step": 24060 + }, + { + "ce_loss": 0.024669796228408813, + "epoch": 8.025350233488993, + "step": 24060 + }, + { + "distill_loss": 0.20169229805469513, + "epoch": 8.025350233488993, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "ref_ce_loss": 0.06511060893535614, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "loss": 0.24008479714393616, + "step": 24060 + }, + { + "ce_loss": 0.03973004221916199, + "epoch": 8.025350233488993, + "step": 24060 + }, + { + "distill_loss": 0.13114789128303528, + "epoch": 8.025350233488993, + "step": 24060 + }, + { + "epoch": 8.025350233488993, + "ref_ce_loss": 0.06896114349365234, + "step": 24060 + }, + { + "epoch": 8.028685790527017, + "loss": 0.4315, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "grad_norm": 1.3001312017440796, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "learning_rate": 7.879744951519618e-05, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "loss": 0.4929823577404022, + "step": 24070 + }, + { + "ce_loss": 0.09074297547340393, + "epoch": 8.028685790527017, + "step": 24070 + }, + { + "distill_loss": 0.21786439418792725, + "epoch": 8.028685790527017, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "ref_ce_loss": 0.11397624760866165, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "loss": 0.48569536209106445, + "step": 24070 + }, + { + "ce_loss": 0.06525580585002899, + "epoch": 8.028685790527017, + "step": 24070 + }, + { + "distill_loss": 0.20845040678977966, + "epoch": 8.028685790527017, + "step": 24070 + }, + { + "epoch": 8.028685790527017, + "ref_ce_loss": 0.06835462898015976, + "step": 24070 + }, + { + "epoch": 8.032021347565044, + "loss": 0.4435, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "grad_norm": 5.342554569244385, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "learning_rate": 7.854009968083298e-05, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "loss": 0.5509377717971802, + "step": 24080 + }, + { + "ce_loss": 0.07140115648508072, + "epoch": 8.032021347565044, + "step": 24080 + }, + { + "distill_loss": 0.18408143520355225, + "epoch": 8.032021347565044, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "ref_ce_loss": 0.0856771245598793, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "loss": 0.5194607377052307, + "step": 24080 + }, + { + "ce_loss": 0.08500373363494873, + "epoch": 8.032021347565044, + "step": 24080 + }, + { + "distill_loss": 0.180271178483963, + "epoch": 8.032021347565044, + "step": 24080 + }, + { + "epoch": 8.032021347565044, + "ref_ce_loss": 0.07013080269098282, + "step": 24080 + }, + { + "epoch": 8.035356904603068, + "loss": 0.4726, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "grad_norm": 1.3187562227249146, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "learning_rate": 7.828312502476397e-05, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "loss": 0.34251484274864197, + "step": 24090 + }, + { + "ce_loss": 0.06243763864040375, + "epoch": 8.035356904603068, + "step": 24090 + }, + { + "distill_loss": 0.2157568335533142, + "epoch": 8.035356904603068, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "ref_ce_loss": 0.06419803947210312, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "loss": 0.33506128191947937, + "step": 24090 + }, + { + "ce_loss": 0.04710674658417702, + "epoch": 8.035356904603068, + "step": 24090 + }, + { + "distill_loss": 0.1622208207845688, + "epoch": 8.035356904603068, + "step": 24090 + }, + { + "epoch": 8.035356904603068, + "ref_ce_loss": 0.0668625682592392, + "step": 24090 + }, + { + "epoch": 8.038692461641094, + "loss": 0.3903, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "grad_norm": 1.0231257677078247, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "learning_rate": 7.802652584690626e-05, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "loss": 0.34682032465934753, + "step": 24100 + }, + { + "ce_loss": 0.074663445353508, + "epoch": 8.038692461641094, + "step": 24100 + }, + { + "distill_loss": 0.19481858611106873, + "epoch": 8.038692461641094, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "ref_ce_loss": 0.076902836561203, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "loss": 0.40327954292297363, + "step": 24100 + }, + { + "ce_loss": 0.06977633386850357, + "epoch": 8.038692461641094, + "step": 24100 + }, + { + "distill_loss": 0.20230472087860107, + "epoch": 8.038692461641094, + "step": 24100 + }, + { + "epoch": 8.038692461641094, + "ref_ce_loss": 0.09668795019388199, + "step": 24100 + }, + { + "epoch": 8.042028018679119, + "loss": 0.4016, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "grad_norm": 1.466809630393982, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "learning_rate": 7.777030244673862e-05, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "loss": 0.4166834354400635, + "step": 24110 + }, + { + "ce_loss": 0.029378900304436684, + "epoch": 8.042028018679119, + "step": 24110 + }, + { + "distill_loss": 0.20899909734725952, + "epoch": 8.042028018679119, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "ref_ce_loss": 0.0682293102145195, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "loss": 0.43382200598716736, + "step": 24110 + }, + { + "ce_loss": 0.0649435818195343, + "epoch": 8.042028018679119, + "step": 24110 + }, + { + "distill_loss": 0.18221789598464966, + "epoch": 8.042028018679119, + "step": 24110 + }, + { + "epoch": 8.042028018679119, + "ref_ce_loss": 0.07515498250722885, + "step": 24110 + }, + { + "epoch": 8.045363575717145, + "loss": 0.4193, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "grad_norm": 1.700887680053711, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "learning_rate": 7.751445512330149e-05, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "loss": 0.5536231994628906, + "step": 24120 + }, + { + "ce_loss": 0.09126371145248413, + "epoch": 8.045363575717145, + "step": 24120 + }, + { + "distill_loss": 0.2512921392917633, + "epoch": 8.045363575717145, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "ref_ce_loss": 0.09933537989854813, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "loss": 0.36023950576782227, + "step": 24120 + }, + { + "ce_loss": 0.06016818434000015, + "epoch": 8.045363575717145, + "step": 24120 + }, + { + "distill_loss": 0.1999480277299881, + "epoch": 8.045363575717145, + "step": 24120 + }, + { + "epoch": 8.045363575717145, + "ref_ce_loss": 0.06371976435184479, + "step": 24120 + }, + { + "epoch": 8.04869913275517, + "loss": 0.4155, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "grad_norm": 1.236303448677063, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "learning_rate": 7.725898417519601e-05, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "loss": 0.3462640643119812, + "step": 24130 + }, + { + "ce_loss": 0.08088426291942596, + "epoch": 8.04869913275517, + "step": 24130 + }, + { + "distill_loss": 0.20738530158996582, + "epoch": 8.04869913275517, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "ref_ce_loss": 0.04303397238254547, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "loss": 0.3455406725406647, + "step": 24130 + }, + { + "ce_loss": 0.03262823447585106, + "epoch": 8.04869913275517, + "step": 24130 + }, + { + "distill_loss": 0.1683516800403595, + "epoch": 8.04869913275517, + "step": 24130 + }, + { + "epoch": 8.04869913275517, + "ref_ce_loss": 0.07156498730182648, + "step": 24130 + }, + { + "epoch": 8.052034689793196, + "loss": 0.3823, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "grad_norm": 1.3769203424453735, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "learning_rate": 7.700388990058436e-05, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "loss": 0.28749528527259827, + "step": 24140 + }, + { + "ce_loss": 0.05094445124268532, + "epoch": 8.052034689793196, + "step": 24140 + }, + { + "distill_loss": 0.16284839808940887, + "epoch": 8.052034689793196, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "ref_ce_loss": 0.05306845158338547, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "loss": 0.5040078163146973, + "step": 24140 + }, + { + "ce_loss": 0.07084736973047256, + "epoch": 8.052034689793196, + "step": 24140 + }, + { + "distill_loss": 0.2221842110157013, + "epoch": 8.052034689793196, + "step": 24140 + }, + { + "epoch": 8.052034689793196, + "ref_ce_loss": 0.08755800873041153, + "step": 24140 + }, + { + "epoch": 8.05537024683122, + "loss": 0.4037, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "grad_norm": 1.3036253452301025, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "learning_rate": 7.674917259718903e-05, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "loss": 0.4268152117729187, + "step": 24150 + }, + { + "ce_loss": 0.05660969018936157, + "epoch": 8.05537024683122, + "step": 24150 + }, + { + "distill_loss": 0.1625378578901291, + "epoch": 8.05537024683122, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "ref_ce_loss": 0.0825534239411354, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "loss": 0.40700727701187134, + "step": 24150 + }, + { + "ce_loss": 0.042284682393074036, + "epoch": 8.05537024683122, + "step": 24150 + }, + { + "distill_loss": 0.2171652764081955, + "epoch": 8.05537024683122, + "step": 24150 + }, + { + "epoch": 8.05537024683122, + "ref_ce_loss": 0.07866621762514114, + "step": 24150 + }, + { + "epoch": 8.058705803869247, + "loss": 0.3902, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "grad_norm": 1.0506428480148315, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "learning_rate": 7.649483256229251e-05, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "loss": 0.3354337811470032, + "step": 24160 + }, + { + "ce_loss": 0.07376205921173096, + "epoch": 8.058705803869247, + "step": 24160 + }, + { + "distill_loss": 0.19081434607505798, + "epoch": 8.058705803869247, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "ref_ce_loss": 0.07030003517866135, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "loss": 0.3053566515445709, + "step": 24160 + }, + { + "ce_loss": 0.034138619899749756, + "epoch": 8.058705803869247, + "step": 24160 + }, + { + "distill_loss": 0.19562920928001404, + "epoch": 8.058705803869247, + "step": 24160 + }, + { + "epoch": 8.058705803869247, + "ref_ce_loss": 0.07531195878982544, + "step": 24160 + }, + { + "epoch": 8.062041360907271, + "loss": 0.4097, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "grad_norm": 2.4474267959594727, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "learning_rate": 7.624087009273707e-05, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "loss": 0.3829737901687622, + "step": 24170 + }, + { + "ce_loss": 0.0542147271335125, + "epoch": 8.062041360907271, + "step": 24170 + }, + { + "distill_loss": 0.21440204977989197, + "epoch": 8.062041360907271, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "ref_ce_loss": 0.0830206423997879, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "loss": 0.3945467174053192, + "step": 24170 + }, + { + "ce_loss": 0.05573704093694687, + "epoch": 8.062041360907271, + "step": 24170 + }, + { + "distill_loss": 0.2653251886367798, + "epoch": 8.062041360907271, + "step": 24170 + }, + { + "epoch": 8.062041360907271, + "ref_ce_loss": 0.0554734468460083, + "step": 24170 + }, + { + "epoch": 8.065376917945297, + "loss": 0.4454, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "grad_norm": 1.5293794870376587, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "learning_rate": 7.598728548492409e-05, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "loss": 0.38905197381973267, + "step": 24180 + }, + { + "ce_loss": 0.05039578303694725, + "epoch": 8.065376917945297, + "step": 24180 + }, + { + "distill_loss": 0.21358920633792877, + "epoch": 8.065376917945297, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "ref_ce_loss": 0.060911811888217926, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "loss": 0.3089878559112549, + "step": 24180 + }, + { + "ce_loss": 0.05048203095793724, + "epoch": 8.065376917945297, + "step": 24180 + }, + { + "distill_loss": 0.16113674640655518, + "epoch": 8.065376917945297, + "step": 24180 + }, + { + "epoch": 8.065376917945297, + "ref_ce_loss": 0.048411983996629715, + "step": 24180 + }, + { + "epoch": 8.068712474983322, + "loss": 0.4158, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "grad_norm": 2.302577257156372, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "learning_rate": 7.573407903481414e-05, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "loss": 0.3494682312011719, + "step": 24190 + }, + { + "ce_loss": 0.03522660210728645, + "epoch": 8.068712474983322, + "step": 24190 + }, + { + "distill_loss": 0.17571492493152618, + "epoch": 8.068712474983322, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "ref_ce_loss": 0.05610926076769829, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "loss": 0.3886391222476959, + "step": 24190 + }, + { + "ce_loss": 0.07231699675321579, + "epoch": 8.068712474983322, + "step": 24190 + }, + { + "distill_loss": 0.22073721885681152, + "epoch": 8.068712474983322, + "step": 24190 + }, + { + "epoch": 8.068712474983322, + "ref_ce_loss": 0.07439137250185013, + "step": 24190 + }, + { + "epoch": 8.072048032021348, + "loss": 0.4069, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "grad_norm": 1.2229115962982178, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "learning_rate": 7.54812510379264e-05, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "loss": 0.48251187801361084, + "step": 24200 + }, + { + "ce_loss": 0.07577910274267197, + "epoch": 8.072048032021348, + "step": 24200 + }, + { + "distill_loss": 0.21471984684467316, + "epoch": 8.072048032021348, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "ref_ce_loss": 0.07716456800699234, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "loss": 0.5132521390914917, + "step": 24200 + }, + { + "ce_loss": 0.11462902277708054, + "epoch": 8.072048032021348, + "step": 24200 + }, + { + "distill_loss": 0.21093082427978516, + "epoch": 8.072048032021348, + "step": 24200 + }, + { + "epoch": 8.072048032021348, + "ref_ce_loss": 0.08183901757001877, + "step": 24200 + }, + { + "epoch": 8.075383589059372, + "loss": 0.4118, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "grad_norm": 1.0189741849899292, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "learning_rate": 7.522880178933838e-05, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "loss": 0.33431535959243774, + "step": 24210 + }, + { + "ce_loss": 0.060533054172992706, + "epoch": 8.075383589059372, + "step": 24210 + }, + { + "distill_loss": 0.18593090772628784, + "epoch": 8.075383589059372, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "ref_ce_loss": 0.05473209545016289, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "loss": 0.40760648250579834, + "step": 24210 + }, + { + "ce_loss": 0.0526285283267498, + "epoch": 8.075383589059372, + "step": 24210 + }, + { + "distill_loss": 0.2151063233613968, + "epoch": 8.075383589059372, + "step": 24210 + }, + { + "epoch": 8.075383589059372, + "ref_ce_loss": 0.06521545350551605, + "step": 24210 + }, + { + "epoch": 8.078719146097399, + "loss": 0.4553, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "grad_norm": 1.1608573198318481, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "learning_rate": 7.497673158368547e-05, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "loss": 0.5061840415000916, + "step": 24220 + }, + { + "ce_loss": 0.0969315618276596, + "epoch": 8.078719146097399, + "step": 24220 + }, + { + "distill_loss": 0.2560502886772156, + "epoch": 8.078719146097399, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "ref_ce_loss": 0.1011023297905922, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "loss": 0.4152766168117523, + "step": 24220 + }, + { + "ce_loss": 0.0723932608962059, + "epoch": 8.078719146097399, + "step": 24220 + }, + { + "distill_loss": 0.19282567501068115, + "epoch": 8.078719146097399, + "step": 24220 + }, + { + "epoch": 8.078719146097399, + "ref_ce_loss": 0.09820578992366791, + "step": 24220 + }, + { + "epoch": 8.082054703135423, + "loss": 0.4628, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "grad_norm": 1.7025099992752075, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "learning_rate": 7.472504071516078e-05, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "loss": 0.3487241864204407, + "step": 24230 + }, + { + "ce_loss": 0.06950736790895462, + "epoch": 8.082054703135423, + "step": 24230 + }, + { + "distill_loss": 0.18906283378601074, + "epoch": 8.082054703135423, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "ref_ce_loss": 0.0899842232465744, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "loss": 0.46370089054107666, + "step": 24230 + }, + { + "ce_loss": 0.05824560672044754, + "epoch": 8.082054703135423, + "step": 24230 + }, + { + "distill_loss": 0.21629062294960022, + "epoch": 8.082054703135423, + "step": 24230 + }, + { + "epoch": 8.082054703135423, + "ref_ce_loss": 0.07762110233306885, + "step": 24230 + }, + { + "epoch": 8.08539026017345, + "loss": 0.4067, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "grad_norm": 1.4699956178665161, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "learning_rate": 7.447372947751468e-05, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "loss": 0.2601238489151001, + "step": 24240 + }, + { + "ce_loss": 0.019038693979382515, + "epoch": 8.08539026017345, + "step": 24240 + }, + { + "distill_loss": 0.1835421919822693, + "epoch": 8.08539026017345, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "ref_ce_loss": 0.057193856686353683, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "loss": 0.2840674817562103, + "step": 24240 + }, + { + "ce_loss": 0.04014355316758156, + "epoch": 8.08539026017345, + "step": 24240 + }, + { + "distill_loss": 0.17024365067481995, + "epoch": 8.08539026017345, + "step": 24240 + }, + { + "epoch": 8.08539026017345, + "ref_ce_loss": 0.054361842572689056, + "step": 24240 + }, + { + "epoch": 8.088725817211474, + "loss": 0.3839, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "grad_norm": 1.4079599380493164, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "learning_rate": 7.422279816405428e-05, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "loss": 0.25054410099983215, + "step": 24250 + }, + { + "ce_loss": 0.043765123933553696, + "epoch": 8.088725817211474, + "step": 24250 + }, + { + "distill_loss": 0.15811416506767273, + "epoch": 8.088725817211474, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "ref_ce_loss": 0.04835840314626694, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "loss": 0.31558120250701904, + "step": 24250 + }, + { + "ce_loss": 0.0415969081223011, + "epoch": 8.088725817211474, + "step": 24250 + }, + { + "distill_loss": 0.18933415412902832, + "epoch": 8.088725817211474, + "step": 24250 + }, + { + "epoch": 8.088725817211474, + "ref_ce_loss": 0.061390385031700134, + "step": 24250 + }, + { + "epoch": 8.0920613742495, + "loss": 0.4033, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "grad_norm": 1.592460036277771, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "learning_rate": 7.397224706764351e-05, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "loss": 0.5874719619750977, + "step": 24260 + }, + { + "ce_loss": 0.04022669792175293, + "epoch": 8.0920613742495, + "step": 24260 + }, + { + "distill_loss": 0.20707687735557556, + "epoch": 8.0920613742495, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "ref_ce_loss": 0.06614813953638077, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "loss": 0.39668652415275574, + "step": 24260 + }, + { + "ce_loss": 0.07553507387638092, + "epoch": 8.0920613742495, + "step": 24260 + }, + { + "distill_loss": 0.21756426990032196, + "epoch": 8.0920613742495, + "step": 24260 + }, + { + "epoch": 8.0920613742495, + "ref_ce_loss": 0.0744190663099289, + "step": 24260 + }, + { + "epoch": 8.095396931287524, + "loss": 0.445, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "grad_norm": 1.4449187517166138, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "learning_rate": 7.372207648070242e-05, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "loss": 0.33231836557388306, + "step": 24270 + }, + { + "ce_loss": 0.039733968675136566, + "epoch": 8.095396931287524, + "step": 24270 + }, + { + "distill_loss": 0.18775151669979095, + "epoch": 8.095396931287524, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "ref_ce_loss": 0.08045101910829544, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "loss": 0.552812933921814, + "step": 24270 + }, + { + "ce_loss": 0.046159908175468445, + "epoch": 8.095396931287524, + "step": 24270 + }, + { + "distill_loss": 0.2231673002243042, + "epoch": 8.095396931287524, + "step": 24270 + }, + { + "epoch": 8.095396931287524, + "ref_ce_loss": 0.05768783763051033, + "step": 24270 + }, + { + "epoch": 8.09873248832555, + "loss": 0.3826, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "grad_norm": 1.5561636686325073, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "learning_rate": 7.347228669520716e-05, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "loss": 0.33394551277160645, + "step": 24280 + }, + { + "ce_loss": 0.034008752554655075, + "epoch": 8.09873248832555, + "step": 24280 + }, + { + "distill_loss": 0.1595076322555542, + "epoch": 8.09873248832555, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "ref_ce_loss": 0.07015834003686905, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "loss": 0.6523812413215637, + "step": 24280 + }, + { + "ce_loss": 0.08651169389486313, + "epoch": 8.09873248832555, + "step": 24280 + }, + { + "distill_loss": 0.19471541047096252, + "epoch": 8.09873248832555, + "step": 24280 + }, + { + "epoch": 8.09873248832555, + "ref_ce_loss": 0.07609721273183823, + "step": 24280 + }, + { + "epoch": 8.102068045363575, + "loss": 0.382, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "grad_norm": 1.1743472814559937, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "learning_rate": 7.322287800268908e-05, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "loss": 0.3828940689563751, + "step": 24290 + }, + { + "ce_loss": 0.06921820342540741, + "epoch": 8.102068045363575, + "step": 24290 + }, + { + "distill_loss": 0.18600760400295258, + "epoch": 8.102068045363575, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "ref_ce_loss": 0.0913751944899559, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "loss": 0.2832377552986145, + "step": 24290 + }, + { + "ce_loss": 0.0540679506957531, + "epoch": 8.102068045363575, + "step": 24290 + }, + { + "distill_loss": 0.14307625591754913, + "epoch": 8.102068045363575, + "step": 24290 + }, + { + "epoch": 8.102068045363575, + "ref_ce_loss": 0.08550086617469788, + "step": 24290 + }, + { + "epoch": 8.105403602401601, + "loss": 0.3986, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "grad_norm": 1.3745160102844238, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "learning_rate": 7.297385069423502e-05, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "loss": 0.48997801542282104, + "step": 24300 + }, + { + "ce_loss": 0.04027820751070976, + "epoch": 8.105403602401601, + "step": 24300 + }, + { + "distill_loss": 0.19105367362499237, + "epoch": 8.105403602401601, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "ref_ce_loss": 0.056856460869312286, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "loss": 0.39413440227508545, + "step": 24300 + }, + { + "ce_loss": 0.06110284477472305, + "epoch": 8.105403602401601, + "step": 24300 + }, + { + "distill_loss": 0.20504486560821533, + "epoch": 8.105403602401601, + "step": 24300 + }, + { + "epoch": 8.105403602401601, + "ref_ce_loss": 0.07440733909606934, + "step": 24300 + }, + { + "epoch": 8.108739159439626, + "loss": 0.4335, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "grad_norm": 2.263183116912842, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "learning_rate": 7.272520506048653e-05, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "loss": 0.42609086632728577, + "step": 24310 + }, + { + "ce_loss": 0.06666922569274902, + "epoch": 8.108739159439626, + "step": 24310 + }, + { + "distill_loss": 0.2181270867586136, + "epoch": 8.108739159439626, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "ref_ce_loss": 0.08793104439973831, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "loss": 0.29217106103897095, + "step": 24310 + }, + { + "ce_loss": 0.030751807615160942, + "epoch": 8.108739159439626, + "step": 24310 + }, + { + "distill_loss": 0.1928568184375763, + "epoch": 8.108739159439626, + "step": 24310 + }, + { + "epoch": 8.108739159439626, + "ref_ce_loss": 0.04765959829092026, + "step": 24310 + }, + { + "epoch": 8.112074716477652, + "loss": 0.435, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "grad_norm": 1.2746638059616089, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "learning_rate": 7.247694139164023e-05, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "loss": 0.2549442648887634, + "step": 24320 + }, + { + "ce_loss": 0.03367112949490547, + "epoch": 8.112074716477652, + "step": 24320 + }, + { + "distill_loss": 0.16045795381069183, + "epoch": 8.112074716477652, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "ref_ce_loss": 0.060577113181352615, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "loss": 0.37653446197509766, + "step": 24320 + }, + { + "ce_loss": 0.08378375321626663, + "epoch": 8.112074716477652, + "step": 24320 + }, + { + "distill_loss": 0.18870356678962708, + "epoch": 8.112074716477652, + "step": 24320 + }, + { + "epoch": 8.112074716477652, + "ref_ce_loss": 0.07577314972877502, + "step": 24320 + }, + { + "epoch": 8.115410273515677, + "loss": 0.4013, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "grad_norm": 1.232866883277893, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "learning_rate": 7.22290599774461e-05, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "loss": 0.36797034740448, + "step": 24330 + }, + { + "ce_loss": 0.05298295617103577, + "epoch": 8.115410273515677, + "step": 24330 + }, + { + "distill_loss": 0.147117018699646, + "epoch": 8.115410273515677, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "ref_ce_loss": 0.10681744664907455, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "loss": 0.29472821950912476, + "step": 24330 + }, + { + "ce_loss": 0.0275767482817173, + "epoch": 8.115410273515677, + "step": 24330 + }, + { + "distill_loss": 0.19650474190711975, + "epoch": 8.115410273515677, + "step": 24330 + }, + { + "epoch": 8.115410273515677, + "ref_ce_loss": 0.053246621042490005, + "step": 24330 + }, + { + "epoch": 8.118745830553703, + "loss": 0.4056, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "grad_norm": 1.2795344591140747, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "learning_rate": 7.198156110720864e-05, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "loss": 0.334178626537323, + "step": 24340 + }, + { + "ce_loss": 0.03340744599699974, + "epoch": 8.118745830553703, + "step": 24340 + }, + { + "distill_loss": 0.17978930473327637, + "epoch": 8.118745830553703, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "ref_ce_loss": 0.08155786991119385, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "loss": 0.2408166229724884, + "step": 24340 + }, + { + "ce_loss": 0.029374677687883377, + "epoch": 8.118745830553703, + "step": 24340 + }, + { + "distill_loss": 0.14466525614261627, + "epoch": 8.118745830553703, + "step": 24340 + }, + { + "epoch": 8.118745830553703, + "ref_ce_loss": 0.04205675050616264, + "step": 24340 + }, + { + "epoch": 8.122081387591727, + "loss": 0.4237, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "grad_norm": 1.5818060636520386, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "learning_rate": 7.173444506978557e-05, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "loss": 0.32580411434173584, + "step": 24350 + }, + { + "ce_loss": 0.03970218077301979, + "epoch": 8.122081387591727, + "step": 24350 + }, + { + "distill_loss": 0.2042965441942215, + "epoch": 8.122081387591727, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "ref_ce_loss": 0.05940883979201317, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "loss": 0.4163911044597626, + "step": 24350 + }, + { + "ce_loss": 0.08233669400215149, + "epoch": 8.122081387591727, + "step": 24350 + }, + { + "distill_loss": 0.23903019726276398, + "epoch": 8.122081387591727, + "step": 24350 + }, + { + "epoch": 8.122081387591727, + "ref_ce_loss": 0.07528358697891235, + "step": 24350 + }, + { + "epoch": 8.125416944629754, + "loss": 0.4194, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "grad_norm": 1.5836371183395386, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "learning_rate": 7.14877121535881e-05, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "loss": 0.30909478664398193, + "step": 24360 + }, + { + "ce_loss": 0.031062021851539612, + "epoch": 8.125416944629754, + "step": 24360 + }, + { + "distill_loss": 0.19787774980068207, + "epoch": 8.125416944629754, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "ref_ce_loss": 0.07995710521936417, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "loss": 0.4969050884246826, + "step": 24360 + }, + { + "ce_loss": 0.04320429638028145, + "epoch": 8.125416944629754, + "step": 24360 + }, + { + "distill_loss": 0.1878196746110916, + "epoch": 8.125416944629754, + "step": 24360 + }, + { + "epoch": 8.125416944629754, + "ref_ce_loss": 0.07372452318668365, + "step": 24360 + }, + { + "epoch": 8.128752501667778, + "loss": 0.4068, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "grad_norm": 1.9463021755218506, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "learning_rate": 7.124136264657976e-05, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "loss": 0.4209724962711334, + "step": 24370 + }, + { + "ce_loss": 0.0419982448220253, + "epoch": 8.128752501667778, + "step": 24370 + }, + { + "distill_loss": 0.1879243701696396, + "epoch": 8.128752501667778, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "ref_ce_loss": 0.08443693071603775, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "loss": 0.39130768179893494, + "step": 24370 + }, + { + "ce_loss": 0.08258134126663208, + "epoch": 8.128752501667778, + "step": 24370 + }, + { + "distill_loss": 0.19457323849201202, + "epoch": 8.128752501667778, + "step": 24370 + }, + { + "epoch": 8.128752501667778, + "ref_ce_loss": 0.08725336939096451, + "step": 24370 + }, + { + "epoch": 8.132088058705804, + "loss": 0.3772, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "grad_norm": 1.717122197151184, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "learning_rate": 7.099539683627714e-05, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "loss": 0.41437631845474243, + "step": 24380 + }, + { + "ce_loss": 0.0480630062520504, + "epoch": 8.132088058705804, + "step": 24380 + }, + { + "distill_loss": 0.2810288965702057, + "epoch": 8.132088058705804, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "ref_ce_loss": 0.08493179082870483, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "loss": 0.3595638573169708, + "step": 24380 + }, + { + "ce_loss": 0.048516351729631424, + "epoch": 8.132088058705804, + "step": 24380 + }, + { + "distill_loss": 0.14217573404312134, + "epoch": 8.132088058705804, + "step": 24380 + }, + { + "epoch": 8.132088058705804, + "ref_ce_loss": 0.06873290985822678, + "step": 24380 + }, + { + "epoch": 8.135423615743829, + "loss": 0.4, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "grad_norm": 1.4674592018127441, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "learning_rate": 7.07498150097488e-05, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "loss": 0.40654972195625305, + "step": 24390 + }, + { + "ce_loss": 0.09343191236257553, + "epoch": 8.135423615743829, + "step": 24390 + }, + { + "distill_loss": 0.19275778532028198, + "epoch": 8.135423615743829, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "ref_ce_loss": 0.08056753128767014, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "loss": 0.44827786087989807, + "step": 24390 + }, + { + "ce_loss": 0.04539187625050545, + "epoch": 8.135423615743829, + "step": 24390 + }, + { + "distill_loss": 0.23888753354549408, + "epoch": 8.135423615743829, + "step": 24390 + }, + { + "epoch": 8.135423615743829, + "ref_ce_loss": 0.07933187484741211, + "step": 24390 + }, + { + "epoch": 8.138759172781855, + "loss": 0.4132, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "grad_norm": 0.793762743473053, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "learning_rate": 7.05046174536152e-05, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "loss": 0.37410956621170044, + "step": 24400 + }, + { + "ce_loss": 0.08392191678285599, + "epoch": 8.138759172781855, + "step": 24400 + }, + { + "distill_loss": 0.19799497723579407, + "epoch": 8.138759172781855, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "ref_ce_loss": 0.09189742058515549, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "loss": 0.3863672912120819, + "step": 24400 + }, + { + "ce_loss": 0.0856778547167778, + "epoch": 8.138759172781855, + "step": 24400 + }, + { + "distill_loss": 0.20787179470062256, + "epoch": 8.138759172781855, + "step": 24400 + }, + { + "epoch": 8.138759172781855, + "ref_ce_loss": 0.09266812354326248, + "step": 24400 + }, + { + "epoch": 8.14209472981988, + "loss": 0.3746, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "grad_norm": 1.1739890575408936, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "learning_rate": 7.025980445404811e-05, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "loss": 0.6801389455795288, + "step": 24410 + }, + { + "ce_loss": 0.09862788766622543, + "epoch": 8.14209472981988, + "step": 24410 + }, + { + "distill_loss": 0.2498609572649002, + "epoch": 8.14209472981988, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "ref_ce_loss": 0.09175879508256912, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "loss": 0.4861341714859009, + "step": 24410 + }, + { + "ce_loss": 0.06381076574325562, + "epoch": 8.14209472981988, + "step": 24410 + }, + { + "distill_loss": 0.23393845558166504, + "epoch": 8.14209472981988, + "step": 24410 + }, + { + "epoch": 8.14209472981988, + "ref_ce_loss": 0.07300575077533722, + "step": 24410 + }, + { + "epoch": 8.145430286857906, + "loss": 0.412, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "grad_norm": 2.2138359546661377, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "learning_rate": 7.001537629677061e-05, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "loss": 0.4213656187057495, + "step": 24420 + }, + { + "ce_loss": 0.08057521283626556, + "epoch": 8.145430286857906, + "step": 24420 + }, + { + "distill_loss": 0.2415209710597992, + "epoch": 8.145430286857906, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "ref_ce_loss": 0.07787420600652695, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "loss": 0.3960283100605011, + "step": 24420 + }, + { + "ce_loss": 0.06811496615409851, + "epoch": 8.145430286857906, + "step": 24420 + }, + { + "distill_loss": 0.2203001230955124, + "epoch": 8.145430286857906, + "step": 24420 + }, + { + "epoch": 8.145430286857906, + "ref_ce_loss": 0.0836324393749237, + "step": 24420 + }, + { + "epoch": 8.14876584389593, + "loss": 0.4497, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "grad_norm": 1.6273220777511597, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "learning_rate": 6.977133326705701e-05, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "loss": 0.4896760582923889, + "step": 24430 + }, + { + "ce_loss": 0.03350548446178436, + "epoch": 8.14876584389593, + "step": 24430 + }, + { + "distill_loss": 0.16317717730998993, + "epoch": 8.14876584389593, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "ref_ce_loss": 0.06565791368484497, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "loss": 0.5192831158638, + "step": 24430 + }, + { + "ce_loss": 0.08715259283781052, + "epoch": 8.14876584389593, + "step": 24430 + }, + { + "distill_loss": 0.22097237408161163, + "epoch": 8.14876584389593, + "step": 24430 + }, + { + "epoch": 8.14876584389593, + "ref_ce_loss": 0.06737726926803589, + "step": 24430 + }, + { + "epoch": 8.152101400933956, + "loss": 0.434, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "grad_norm": 1.2134703397750854, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "learning_rate": 6.952767564973149e-05, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "loss": 0.42709025740623474, + "step": 24440 + }, + { + "ce_loss": 0.06981752067804337, + "epoch": 8.152101400933956, + "step": 24440 + }, + { + "distill_loss": 0.26825565099716187, + "epoch": 8.152101400933956, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "ref_ce_loss": 0.06773539632558823, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "loss": 0.2758011221885681, + "step": 24440 + }, + { + "ce_loss": 0.03051314689218998, + "epoch": 8.152101400933956, + "step": 24440 + }, + { + "distill_loss": 0.17103226482868195, + "epoch": 8.152101400933956, + "step": 24440 + }, + { + "epoch": 8.152101400933956, + "ref_ce_loss": 0.07407353818416595, + "step": 24440 + }, + { + "epoch": 8.15543695797198, + "loss": 0.4523, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "grad_norm": 0.9241464734077454, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "learning_rate": 6.928440372916872e-05, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "loss": 0.3098130524158478, + "step": 24450 + }, + { + "ce_loss": 0.06479384750127792, + "epoch": 8.15543695797198, + "step": 24450 + }, + { + "distill_loss": 0.16387423872947693, + "epoch": 8.15543695797198, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "ref_ce_loss": 0.051972776651382446, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "loss": 0.26677024364471436, + "step": 24450 + }, + { + "ce_loss": 0.025871606543660164, + "epoch": 8.15543695797198, + "step": 24450 + }, + { + "distill_loss": 0.17901934683322906, + "epoch": 8.15543695797198, + "step": 24450 + }, + { + "epoch": 8.15543695797198, + "ref_ce_loss": 0.04680072143673897, + "step": 24450 + }, + { + "epoch": 8.158772515010007, + "loss": 0.4329, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "grad_norm": 1.5348268747329712, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "learning_rate": 6.904151778929331e-05, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "loss": 0.30024340748786926, + "step": 24460 + }, + { + "ce_loss": 0.0465410090982914, + "epoch": 8.158772515010007, + "step": 24460 + }, + { + "distill_loss": 0.17435558140277863, + "epoch": 8.158772515010007, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "ref_ce_loss": 0.07861968874931335, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "loss": 0.621793806552887, + "step": 24460 + }, + { + "ce_loss": 0.07838597893714905, + "epoch": 8.158772515010007, + "step": 24460 + }, + { + "distill_loss": 0.22223365306854248, + "epoch": 8.158772515010007, + "step": 24460 + }, + { + "epoch": 8.158772515010007, + "ref_ce_loss": 0.0761762410402298, + "step": 24460 + }, + { + "epoch": 8.162108072048031, + "loss": 0.4525, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "grad_norm": 0.9262333512306213, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "learning_rate": 6.879901811357931e-05, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "loss": 0.4530099034309387, + "step": 24470 + }, + { + "ce_loss": 0.07020189613103867, + "epoch": 8.162108072048031, + "step": 24470 + }, + { + "distill_loss": 0.2217385172843933, + "epoch": 8.162108072048031, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "ref_ce_loss": 0.07607980072498322, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "loss": 0.43743398785591125, + "step": 24470 + }, + { + "ce_loss": 0.10149736702442169, + "epoch": 8.162108072048031, + "step": 24470 + }, + { + "distill_loss": 0.26674532890319824, + "epoch": 8.162108072048031, + "step": 24470 + }, + { + "epoch": 8.162108072048031, + "ref_ce_loss": 0.06348459422588348, + "step": 24470 + }, + { + "epoch": 8.165443629086058, + "loss": 0.4397, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "grad_norm": 1.3768205642700195, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "learning_rate": 6.85569049850498e-05, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "loss": 0.3973981440067291, + "step": 24480 + }, + { + "ce_loss": 0.07804957032203674, + "epoch": 8.165443629086058, + "step": 24480 + }, + { + "distill_loss": 0.22942686080932617, + "epoch": 8.165443629086058, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "ref_ce_loss": 0.05899369344115257, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "loss": 0.6007339954376221, + "step": 24480 + }, + { + "ce_loss": 0.05822492390871048, + "epoch": 8.165443629086058, + "step": 24480 + }, + { + "distill_loss": 0.23770646750926971, + "epoch": 8.165443629086058, + "step": 24480 + }, + { + "epoch": 8.165443629086058, + "ref_ce_loss": 0.09923166036605835, + "step": 24480 + }, + { + "epoch": 8.168779186124082, + "loss": 0.4639, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "grad_norm": 1.7965821027755737, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "learning_rate": 6.831517868627693e-05, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "loss": 0.46187624335289, + "step": 24490 + }, + { + "ce_loss": 0.10025662183761597, + "epoch": 8.168779186124082, + "step": 24490 + }, + { + "distill_loss": 0.26213783025741577, + "epoch": 8.168779186124082, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "ref_ce_loss": 0.08308776468038559, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "loss": 0.6011335849761963, + "step": 24490 + }, + { + "ce_loss": 0.1039145365357399, + "epoch": 8.168779186124082, + "step": 24490 + }, + { + "distill_loss": 0.31061768531799316, + "epoch": 8.168779186124082, + "step": 24490 + }, + { + "epoch": 8.168779186124082, + "ref_ce_loss": 0.10056736320257187, + "step": 24490 + }, + { + "epoch": 8.172114743162108, + "loss": 0.4563, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "grad_norm": 1.7553499937057495, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "learning_rate": 6.80738394993813e-05, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "loss": 0.40037602186203003, + "step": 24500 + }, + { + "ce_loss": 0.10152076929807663, + "epoch": 8.172114743162108, + "step": 24500 + }, + { + "distill_loss": 0.21522164344787598, + "epoch": 8.172114743162108, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "ref_ce_loss": 0.08338447660207748, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "loss": 0.2687154710292816, + "step": 24500 + }, + { + "ce_loss": 0.03502975404262543, + "epoch": 8.172114743162108, + "step": 24500 + }, + { + "distill_loss": 0.16856245696544647, + "epoch": 8.172114743162108, + "step": 24500 + }, + { + "epoch": 8.172114743162108, + "ref_ce_loss": 0.04353145882487297, + "step": 24500 + }, + { + "epoch": 8.175450300200133, + "loss": 0.4511, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "grad_norm": 1.439299464225769, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "learning_rate": 6.783288770603169e-05, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "loss": 0.5104728937149048, + "step": 24510 + }, + { + "ce_loss": 0.06655392050743103, + "epoch": 8.175450300200133, + "step": 24510 + }, + { + "distill_loss": 0.2705954313278198, + "epoch": 8.175450300200133, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "ref_ce_loss": 0.08169260621070862, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "loss": 0.4592511057853699, + "step": 24510 + }, + { + "ce_loss": 0.06678766012191772, + "epoch": 8.175450300200133, + "step": 24510 + }, + { + "distill_loss": 0.23218107223510742, + "epoch": 8.175450300200133, + "step": 24510 + }, + { + "epoch": 8.175450300200133, + "ref_ce_loss": 0.07493381947278976, + "step": 24510 + }, + { + "epoch": 8.178785857238159, + "loss": 0.483, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "grad_norm": 1.1014498472213745, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "learning_rate": 6.75923235874449e-05, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "loss": 0.49110180139541626, + "step": 24520 + }, + { + "ce_loss": 0.102203369140625, + "epoch": 8.178785857238159, + "step": 24520 + }, + { + "distill_loss": 0.24019083380699158, + "epoch": 8.178785857238159, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "ref_ce_loss": 0.0786530077457428, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "loss": 0.7304254770278931, + "step": 24520 + }, + { + "ce_loss": 0.04971720278263092, + "epoch": 8.178785857238159, + "step": 24520 + }, + { + "distill_loss": 0.17906175553798676, + "epoch": 8.178785857238159, + "step": 24520 + }, + { + "epoch": 8.178785857238159, + "ref_ce_loss": 0.06416261941194534, + "step": 24520 + }, + { + "epoch": 8.182121414276184, + "loss": 0.4461, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "grad_norm": 1.209089756011963, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "learning_rate": 6.735214742438482e-05, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "loss": 0.8661839962005615, + "step": 24530 + }, + { + "ce_loss": 0.030243270099163055, + "epoch": 8.182121414276184, + "step": 24530 + }, + { + "distill_loss": 0.21225211024284363, + "epoch": 8.182121414276184, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "ref_ce_loss": 0.06496108323335648, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "loss": 0.5166357755661011, + "step": 24530 + }, + { + "ce_loss": 0.06101277470588684, + "epoch": 8.182121414276184, + "step": 24530 + }, + { + "distill_loss": 0.18440604209899902, + "epoch": 8.182121414276184, + "step": 24530 + }, + { + "epoch": 8.182121414276184, + "ref_ce_loss": 0.047844257205724716, + "step": 24530 + }, + { + "epoch": 8.18545697131421, + "loss": 0.4986, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "grad_norm": 1.5618125200271606, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "learning_rate": 6.711235949716316e-05, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "loss": 0.3254857361316681, + "step": 24540 + }, + { + "ce_loss": 0.03264736011624336, + "epoch": 8.18545697131421, + "step": 24540 + }, + { + "distill_loss": 0.1845029890537262, + "epoch": 8.18545697131421, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "ref_ce_loss": 0.05590250343084335, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "loss": 0.29291635751724243, + "step": 24540 + }, + { + "ce_loss": 0.03950078412890434, + "epoch": 8.18545697131421, + "step": 24540 + }, + { + "distill_loss": 0.19686773419380188, + "epoch": 8.18545697131421, + "step": 24540 + }, + { + "epoch": 8.18545697131421, + "ref_ce_loss": 0.05645206943154335, + "step": 24540 + }, + { + "epoch": 8.188792528352234, + "loss": 0.4274, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "grad_norm": 1.675386667251587, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "learning_rate": 6.687296008563828e-05, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "loss": 0.5986274480819702, + "step": 24550 + }, + { + "ce_loss": 0.03906279802322388, + "epoch": 8.188792528352234, + "step": 24550 + }, + { + "distill_loss": 0.19594983756542206, + "epoch": 8.188792528352234, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "ref_ce_loss": 0.0729498490691185, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "loss": 0.41088801622390747, + "step": 24550 + }, + { + "ce_loss": 0.03986736014485359, + "epoch": 8.188792528352234, + "step": 24550 + }, + { + "distill_loss": 0.21624164283275604, + "epoch": 8.188792528352234, + "step": 24550 + }, + { + "epoch": 8.188792528352234, + "ref_ce_loss": 0.08400541543960571, + "step": 24550 + }, + { + "epoch": 8.19212808539026, + "loss": 0.4294, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "grad_norm": 1.3876982927322388, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "learning_rate": 6.663394946921489e-05, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "loss": 0.3449614942073822, + "step": 24560 + }, + { + "ce_loss": 0.05582842603325844, + "epoch": 8.19212808539026, + "step": 24560 + }, + { + "distill_loss": 0.18621015548706055, + "epoch": 8.19212808539026, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "ref_ce_loss": 0.07234183698892593, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "loss": 0.5139371752738953, + "step": 24560 + }, + { + "ce_loss": 0.06884564459323883, + "epoch": 8.19212808539026, + "step": 24560 + }, + { + "distill_loss": 0.23937104642391205, + "epoch": 8.19212808539026, + "step": 24560 + }, + { + "epoch": 8.19212808539026, + "ref_ce_loss": 0.10060980170965195, + "step": 24560 + }, + { + "epoch": 8.195463642428285, + "loss": 0.3932, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "grad_norm": 1.207762360572815, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "learning_rate": 6.639532792684406e-05, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "loss": 0.3394352197647095, + "step": 24570 + }, + { + "ce_loss": 0.05197358503937721, + "epoch": 8.195463642428285, + "step": 24570 + }, + { + "distill_loss": 0.19060836732387543, + "epoch": 8.195463642428285, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "ref_ce_loss": 0.06508396565914154, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "loss": 0.40705007314682007, + "step": 24570 + }, + { + "ce_loss": 0.05480041354894638, + "epoch": 8.195463642428285, + "step": 24570 + }, + { + "distill_loss": 0.1943550407886505, + "epoch": 8.195463642428285, + "step": 24570 + }, + { + "epoch": 8.195463642428285, + "ref_ce_loss": 0.08861961960792542, + "step": 24570 + }, + { + "epoch": 8.198799199466311, + "loss": 0.3948, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "grad_norm": 1.828460693359375, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "learning_rate": 6.615709573702291e-05, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "loss": 0.3562234342098236, + "step": 24580 + }, + { + "ce_loss": 0.08233077079057693, + "epoch": 8.198799199466311, + "step": 24580 + }, + { + "distill_loss": 0.1983891725540161, + "epoch": 8.198799199466311, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "ref_ce_loss": 0.07498323917388916, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "loss": 0.33163464069366455, + "step": 24580 + }, + { + "ce_loss": 0.04026639088988304, + "epoch": 8.198799199466311, + "step": 24580 + }, + { + "distill_loss": 0.17978839576244354, + "epoch": 8.198799199466311, + "step": 24580 + }, + { + "epoch": 8.198799199466311, + "ref_ce_loss": 0.0723813846707344, + "step": 24580 + }, + { + "epoch": 8.202134756504336, + "loss": 0.4389, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "grad_norm": 2.6070868968963623, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "learning_rate": 6.591925317779412e-05, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "loss": 0.28905245661735535, + "step": 24590 + }, + { + "ce_loss": 0.02365030162036419, + "epoch": 8.202134756504336, + "step": 24590 + }, + { + "distill_loss": 0.18448412418365479, + "epoch": 8.202134756504336, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "ref_ce_loss": 0.060775455087423325, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "loss": 0.3054485619068146, + "step": 24590 + }, + { + "ce_loss": 0.05717819184064865, + "epoch": 8.202134756504336, + "step": 24590 + }, + { + "distill_loss": 0.13961108028888702, + "epoch": 8.202134756504336, + "step": 24590 + }, + { + "epoch": 8.202134756504336, + "ref_ce_loss": 0.07123158872127533, + "step": 24590 + }, + { + "epoch": 8.205470313542362, + "loss": 0.3917, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "grad_norm": 1.6249158382415771, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "learning_rate": 6.568180052674535e-05, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "loss": 0.41753503680229187, + "step": 24600 + }, + { + "ce_loss": 0.06948726624250412, + "epoch": 8.205470313542362, + "step": 24600 + }, + { + "distill_loss": 0.23197221755981445, + "epoch": 8.205470313542362, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "ref_ce_loss": 0.07913843542337418, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "loss": 0.37326788902282715, + "step": 24600 + }, + { + "ce_loss": 0.06086521968245506, + "epoch": 8.205470313542362, + "step": 24600 + }, + { + "distill_loss": 0.20718753337860107, + "epoch": 8.205470313542362, + "step": 24600 + }, + { + "epoch": 8.205470313542362, + "ref_ce_loss": 0.07859325408935547, + "step": 24600 + }, + { + "epoch": 8.208805870580386, + "loss": 0.4623, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "grad_norm": 1.1451125144958496, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "learning_rate": 6.544473806100953e-05, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "loss": 0.4168807864189148, + "step": 24610 + }, + { + "ce_loss": 0.08794716000556946, + "epoch": 8.208805870580386, + "step": 24610 + }, + { + "distill_loss": 0.17066535353660583, + "epoch": 8.208805870580386, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "ref_ce_loss": 0.06990622729063034, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "loss": 0.29761603474617004, + "step": 24610 + }, + { + "ce_loss": 0.05575099587440491, + "epoch": 8.208805870580386, + "step": 24610 + }, + { + "distill_loss": 0.18124572932720184, + "epoch": 8.208805870580386, + "step": 24610 + }, + { + "epoch": 8.208805870580386, + "ref_ce_loss": 0.060525815933942795, + "step": 24610 + }, + { + "epoch": 8.212141427618413, + "loss": 0.4032, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "grad_norm": 1.6889187097549438, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "learning_rate": 6.520806605726399e-05, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "loss": 0.3321390151977539, + "step": 24620 + }, + { + "ce_loss": 0.04720529541373253, + "epoch": 8.212141427618413, + "step": 24620 + }, + { + "distill_loss": 0.16193419694900513, + "epoch": 8.212141427618413, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "ref_ce_loss": 0.08601251244544983, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "loss": 0.33572375774383545, + "step": 24620 + }, + { + "ce_loss": 0.0730213150382042, + "epoch": 8.212141427618413, + "step": 24620 + }, + { + "distill_loss": 0.16905918717384338, + "epoch": 8.212141427618413, + "step": 24620 + }, + { + "epoch": 8.212141427618413, + "ref_ce_loss": 0.09331928193569183, + "step": 24620 + }, + { + "epoch": 8.215476984656437, + "loss": 0.3877, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "grad_norm": 1.0236163139343262, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "learning_rate": 6.497178479173056e-05, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "loss": 0.4932849407196045, + "step": 24630 + }, + { + "ce_loss": 0.04189939796924591, + "epoch": 8.215476984656437, + "step": 24630 + }, + { + "distill_loss": 0.1931006908416748, + "epoch": 8.215476984656437, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "ref_ce_loss": 0.08937612175941467, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "loss": 0.3568384051322937, + "step": 24630 + }, + { + "ce_loss": 0.0696152076125145, + "epoch": 8.215476984656437, + "step": 24630 + }, + { + "distill_loss": 0.18815016746520996, + "epoch": 8.215476984656437, + "step": 24630 + }, + { + "epoch": 8.215476984656437, + "ref_ce_loss": 0.06303508579730988, + "step": 24630 + }, + { + "epoch": 8.218812541694463, + "loss": 0.4584, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "grad_norm": 1.1400558948516846, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "learning_rate": 6.473589454017464e-05, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "loss": 0.6148601770401001, + "step": 24640 + }, + { + "ce_loss": 0.05197329819202423, + "epoch": 8.218812541694463, + "step": 24640 + }, + { + "distill_loss": 0.21115422248840332, + "epoch": 8.218812541694463, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "ref_ce_loss": 0.09626834839582443, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "loss": 0.3354915976524353, + "step": 24640 + }, + { + "ce_loss": 0.0600191093981266, + "epoch": 8.218812541694463, + "step": 24640 + }, + { + "distill_loss": 0.20265516638755798, + "epoch": 8.218812541694463, + "step": 24640 + }, + { + "epoch": 8.218812541694463, + "ref_ce_loss": 0.07247176766395569, + "step": 24640 + }, + { + "epoch": 8.222148098732488, + "loss": 0.4794, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "grad_norm": 1.4587262868881226, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "learning_rate": 6.450039557790577e-05, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "loss": 0.6026730537414551, + "step": 24650 + }, + { + "ce_loss": 0.08934365212917328, + "epoch": 8.222148098732488, + "step": 24650 + }, + { + "distill_loss": 0.26244986057281494, + "epoch": 8.222148098732488, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "ref_ce_loss": 0.08324063569307327, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "loss": 0.4086545705795288, + "step": 24650 + }, + { + "ce_loss": 0.06538889557123184, + "epoch": 8.222148098732488, + "step": 24650 + }, + { + "distill_loss": 0.18500006198883057, + "epoch": 8.222148098732488, + "step": 24650 + }, + { + "epoch": 8.222148098732488, + "ref_ce_loss": 0.07706859707832336, + "step": 24650 + }, + { + "epoch": 8.225483655770514, + "loss": 0.4254, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "grad_norm": 0.9976798892021179, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "learning_rate": 6.42652881797765e-05, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "loss": 0.35616809129714966, + "step": 24660 + }, + { + "ce_loss": 0.03588799387216568, + "epoch": 8.225483655770514, + "step": 24660 + }, + { + "distill_loss": 0.21862131357192993, + "epoch": 8.225483655770514, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "ref_ce_loss": 0.07292570918798447, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "loss": 0.41984128952026367, + "step": 24660 + }, + { + "ce_loss": 0.08020157366991043, + "epoch": 8.225483655770514, + "step": 24660 + }, + { + "distill_loss": 0.21562683582305908, + "epoch": 8.225483655770514, + "step": 24660 + }, + { + "epoch": 8.225483655770514, + "ref_ce_loss": 0.08471453934907913, + "step": 24660 + }, + { + "epoch": 8.228819212808538, + "loss": 0.389, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "grad_norm": 0.9158706665039062, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "learning_rate": 6.403057262018259e-05, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "loss": 0.3161277770996094, + "step": 24670 + }, + { + "ce_loss": 0.05249093845486641, + "epoch": 8.228819212808538, + "step": 24670 + }, + { + "distill_loss": 0.16578979790210724, + "epoch": 8.228819212808538, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "ref_ce_loss": 0.05899889022111893, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "loss": 0.36990371346473694, + "step": 24670 + }, + { + "ce_loss": 0.0632263645529747, + "epoch": 8.228819212808538, + "step": 24670 + }, + { + "distill_loss": 0.2002817690372467, + "epoch": 8.228819212808538, + "step": 24670 + }, + { + "epoch": 8.228819212808538, + "ref_ce_loss": 0.07740586251020432, + "step": 24670 + }, + { + "epoch": 8.232154769846565, + "loss": 0.418, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "grad_norm": 1.2354545593261719, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "learning_rate": 6.379624917306214e-05, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "loss": 0.40954896807670593, + "step": 24680 + }, + { + "ce_loss": 0.09720776975154877, + "epoch": 8.232154769846565, + "step": 24680 + }, + { + "distill_loss": 0.20144513249397278, + "epoch": 8.232154769846565, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "ref_ce_loss": 0.07739008963108063, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "loss": 0.2706911861896515, + "step": 24680 + }, + { + "ce_loss": 0.03822727128863335, + "epoch": 8.232154769846565, + "step": 24680 + }, + { + "distill_loss": 0.15807346999645233, + "epoch": 8.232154769846565, + "step": 24680 + }, + { + "epoch": 8.232154769846565, + "ref_ce_loss": 0.07413903623819351, + "step": 24680 + }, + { + "epoch": 8.23549032688459, + "loss": 0.4484, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "grad_norm": 1.1851049661636353, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "learning_rate": 6.356231811189593e-05, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "loss": 0.30205926299095154, + "step": 24690 + }, + { + "ce_loss": 0.0655367523431778, + "epoch": 8.23549032688459, + "step": 24690 + }, + { + "distill_loss": 0.16531744599342346, + "epoch": 8.23549032688459, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "ref_ce_loss": 0.07099558413028717, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "loss": 0.38187289237976074, + "step": 24690 + }, + { + "ce_loss": 0.04009734466671944, + "epoch": 8.23549032688459, + "step": 24690 + }, + { + "distill_loss": 0.1971883326768875, + "epoch": 8.23549032688459, + "step": 24690 + }, + { + "epoch": 8.23549032688459, + "ref_ce_loss": 0.0627526342868805, + "step": 24690 + }, + { + "epoch": 8.238825883922615, + "loss": 0.4152, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "grad_norm": 1.4340665340423584, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "learning_rate": 6.332877970970667e-05, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "loss": 0.41680845618247986, + "step": 24700 + }, + { + "ce_loss": 0.0698235034942627, + "epoch": 8.238825883922615, + "step": 24700 + }, + { + "distill_loss": 0.20547881722450256, + "epoch": 8.238825883922615, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "ref_ce_loss": 0.08544392883777618, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "loss": 0.3988693058490753, + "step": 24700 + }, + { + "ce_loss": 0.07071752101182938, + "epoch": 8.238825883922615, + "step": 24700 + }, + { + "distill_loss": 0.21910333633422852, + "epoch": 8.238825883922615, + "step": 24700 + }, + { + "epoch": 8.238825883922615, + "ref_ce_loss": 0.06319587677717209, + "step": 24700 + }, + { + "epoch": 8.24216144096064, + "loss": 0.4185, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "grad_norm": 1.073905110359192, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "learning_rate": 6.309563423905891e-05, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "loss": 0.46671733260154724, + "step": 24710 + }, + { + "ce_loss": 0.047412920743227005, + "epoch": 8.24216144096064, + "step": 24710 + }, + { + "distill_loss": 0.21610336005687714, + "epoch": 8.24216144096064, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "ref_ce_loss": 0.05106896907091141, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "loss": 0.6524374485015869, + "step": 24710 + }, + { + "ce_loss": 0.07691330462694168, + "epoch": 8.24216144096064, + "step": 24710 + }, + { + "distill_loss": 0.21490544080734253, + "epoch": 8.24216144096064, + "step": 24710 + }, + { + "epoch": 8.24216144096064, + "ref_ce_loss": 0.11045210808515549, + "step": 24710 + }, + { + "epoch": 8.245496997998666, + "loss": 0.4092, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "grad_norm": 1.0162745714187622, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "learning_rate": 6.286288197205834e-05, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "loss": 0.6139237880706787, + "step": 24720 + }, + { + "ce_loss": 0.0492384098470211, + "epoch": 8.245496997998666, + "step": 24720 + }, + { + "distill_loss": 0.22562669217586517, + "epoch": 8.245496997998666, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "ref_ce_loss": 0.07977955788373947, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "loss": 0.31097501516342163, + "step": 24720 + }, + { + "ce_loss": 0.04455602914094925, + "epoch": 8.245496997998666, + "step": 24720 + }, + { + "distill_loss": 0.17696070671081543, + "epoch": 8.245496997998666, + "step": 24720 + }, + { + "epoch": 8.245496997998666, + "ref_ce_loss": 0.06480378657579422, + "step": 24720 + }, + { + "epoch": 8.24883255503669, + "loss": 0.4482, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "grad_norm": 1.1851487159729004, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "learning_rate": 6.26305231803519e-05, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "loss": 0.37734466791152954, + "step": 24730 + }, + { + "ce_loss": 0.06257500499486923, + "epoch": 8.24883255503669, + "step": 24730 + }, + { + "distill_loss": 0.20443084836006165, + "epoch": 8.24883255503669, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "ref_ce_loss": 0.08424118161201477, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "loss": 0.28724437952041626, + "step": 24730 + }, + { + "ce_loss": 0.035595282912254333, + "epoch": 8.24883255503669, + "step": 24730 + }, + { + "distill_loss": 0.1421755701303482, + "epoch": 8.24883255503669, + "step": 24730 + }, + { + "epoch": 8.24883255503669, + "ref_ce_loss": 0.06486207991838455, + "step": 24730 + }, + { + "epoch": 8.252168112074717, + "loss": 0.3924, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "grad_norm": 1.8277186155319214, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "learning_rate": 6.239855813512741e-05, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "loss": 0.5576151013374329, + "step": 24740 + }, + { + "ce_loss": 0.07668827474117279, + "epoch": 8.252168112074717, + "step": 24740 + }, + { + "distill_loss": 0.2058444321155548, + "epoch": 8.252168112074717, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "ref_ce_loss": 0.09041954576969147, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "loss": 0.5620578527450562, + "step": 24740 + }, + { + "ce_loss": 0.10907970368862152, + "epoch": 8.252168112074717, + "step": 24740 + }, + { + "distill_loss": 0.2825700044631958, + "epoch": 8.252168112074717, + "step": 24740 + }, + { + "epoch": 8.252168112074717, + "ref_ce_loss": 0.08636128902435303, + "step": 24740 + }, + { + "epoch": 8.255503669112741, + "loss": 0.4372, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "grad_norm": 2.166013717651367, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "learning_rate": 6.216698710711297e-05, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "loss": 0.5015909671783447, + "step": 24750 + }, + { + "ce_loss": 0.07607283443212509, + "epoch": 8.255503669112741, + "step": 24750 + }, + { + "distill_loss": 0.24176400899887085, + "epoch": 8.255503669112741, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "ref_ce_loss": 0.0735190212726593, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "loss": 0.4340944290161133, + "step": 24750 + }, + { + "ce_loss": 0.04279591515660286, + "epoch": 8.255503669112741, + "step": 24750 + }, + { + "distill_loss": 0.21969415247440338, + "epoch": 8.255503669112741, + "step": 24750 + }, + { + "epoch": 8.255503669112741, + "ref_ce_loss": 0.07828209549188614, + "step": 24750 + }, + { + "epoch": 8.258839226150767, + "loss": 0.4027, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "grad_norm": 1.5689845085144043, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "learning_rate": 6.193581036657694e-05, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "loss": 0.3352030813694, + "step": 24760 + }, + { + "ce_loss": 0.08149098604917526, + "epoch": 8.258839226150767, + "step": 24760 + }, + { + "distill_loss": 0.17032968997955322, + "epoch": 8.258839226150767, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "ref_ce_loss": 0.08317124843597412, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "loss": 0.2671343982219696, + "step": 24760 + }, + { + "ce_loss": 0.04549345001578331, + "epoch": 8.258839226150767, + "step": 24760 + }, + { + "distill_loss": 0.158750981092453, + "epoch": 8.258839226150767, + "step": 24760 + }, + { + "epoch": 8.258839226150767, + "ref_ce_loss": 0.06273841112852097, + "step": 24760 + }, + { + "epoch": 8.262174783188792, + "loss": 0.4228, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "grad_norm": 1.2682881355285645, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "learning_rate": 6.17050281833274e-05, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "loss": 0.4441709816455841, + "step": 24770 + }, + { + "ce_loss": 0.058845460414886475, + "epoch": 8.262174783188792, + "step": 24770 + }, + { + "distill_loss": 0.2046021968126297, + "epoch": 8.262174783188792, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "ref_ce_loss": 0.09121354669332504, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "loss": 0.39193737506866455, + "step": 24770 + }, + { + "ce_loss": 0.06523241847753525, + "epoch": 8.262174783188792, + "step": 24770 + }, + { + "distill_loss": 0.20892859995365143, + "epoch": 8.262174783188792, + "step": 24770 + }, + { + "epoch": 8.262174783188792, + "ref_ce_loss": 0.09341893345117569, + "step": 24770 + }, + { + "epoch": 8.265510340226818, + "loss": 0.3767, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "grad_norm": 1.4384493827819824, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "learning_rate": 6.147464082671213e-05, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "loss": 0.2718307375907898, + "step": 24780 + }, + { + "ce_loss": 0.04708772897720337, + "epoch": 8.265510340226818, + "step": 24780 + }, + { + "distill_loss": 0.17009615898132324, + "epoch": 8.265510340226818, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "ref_ce_loss": 0.054384294897317886, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "loss": 0.529242992401123, + "step": 24780 + }, + { + "ce_loss": 0.0579858236014843, + "epoch": 8.265510340226818, + "step": 24780 + }, + { + "distill_loss": 0.19909480214118958, + "epoch": 8.265510340226818, + "step": 24780 + }, + { + "epoch": 8.265510340226818, + "ref_ce_loss": 0.08572480082511902, + "step": 24780 + }, + { + "epoch": 8.268845897264843, + "loss": 0.3977, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "grad_norm": 0.9916096925735474, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "learning_rate": 6.124464856561774e-05, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "loss": 0.4032076299190521, + "step": 24790 + }, + { + "ce_loss": 0.09457050263881683, + "epoch": 8.268845897264843, + "step": 24790 + }, + { + "distill_loss": 0.21368515491485596, + "epoch": 8.268845897264843, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "ref_ce_loss": 0.07465735077857971, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "loss": 0.3135150671005249, + "step": 24790 + }, + { + "ce_loss": 0.045039501041173935, + "epoch": 8.268845897264843, + "step": 24790 + }, + { + "distill_loss": 0.17188379168510437, + "epoch": 8.268845897264843, + "step": 24790 + }, + { + "epoch": 8.268845897264843, + "ref_ce_loss": 0.06974980980157852, + "step": 24790 + }, + { + "epoch": 8.272181454302869, + "loss": 0.4001, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "grad_norm": 1.6688566207885742, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "learning_rate": 6.101505166847008e-05, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "loss": 0.32536888122558594, + "step": 24800 + }, + { + "ce_loss": 0.06018342077732086, + "epoch": 8.272181454302869, + "step": 24800 + }, + { + "distill_loss": 0.18512201309204102, + "epoch": 8.272181454302869, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "ref_ce_loss": 0.07981298118829727, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "loss": 0.24707356095314026, + "step": 24800 + }, + { + "ce_loss": 0.03561161831021309, + "epoch": 8.272181454302869, + "step": 24800 + }, + { + "distill_loss": 0.12160885334014893, + "epoch": 8.272181454302869, + "step": 24800 + }, + { + "epoch": 8.272181454302869, + "ref_ce_loss": 0.06553605198860168, + "step": 24800 + }, + { + "epoch": 8.275517011340893, + "loss": 0.3612, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "grad_norm": 0.9926279187202454, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "learning_rate": 6.078585040323339e-05, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "loss": 0.4812214970588684, + "step": 24810 + }, + { + "ce_loss": 0.057430241256952286, + "epoch": 8.275517011340893, + "step": 24810 + }, + { + "distill_loss": 0.22909767925739288, + "epoch": 8.275517011340893, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "ref_ce_loss": 0.11038453131914139, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "loss": 0.28939327597618103, + "step": 24810 + }, + { + "ce_loss": 0.04220176488161087, + "epoch": 8.275517011340893, + "step": 24810 + }, + { + "distill_loss": 0.15939894318580627, + "epoch": 8.275517011340893, + "step": 24810 + }, + { + "epoch": 8.275517011340893, + "ref_ce_loss": 0.05957774072885513, + "step": 24810 + }, + { + "epoch": 8.27885256837892, + "loss": 0.3925, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "grad_norm": 1.9365723133087158, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "learning_rate": 6.0557045037410355e-05, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "loss": 0.32409194111824036, + "step": 24820 + }, + { + "ce_loss": 0.06542002409696579, + "epoch": 8.27885256837892, + "step": 24820 + }, + { + "distill_loss": 0.17851068079471588, + "epoch": 8.27885256837892, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "ref_ce_loss": 0.05876639112830162, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "loss": 0.43060964345932007, + "step": 24820 + }, + { + "ce_loss": 0.11308394372463226, + "epoch": 8.27885256837892, + "step": 24820 + }, + { + "distill_loss": 0.19883199036121368, + "epoch": 8.27885256837892, + "step": 24820 + }, + { + "epoch": 8.27885256837892, + "ref_ce_loss": 0.08647770434617996, + "step": 24820 + }, + { + "epoch": 8.282188125416944, + "loss": 0.3985, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "grad_norm": 2.2099313735961914, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "learning_rate": 6.0328635838041224e-05, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "loss": 0.3571459650993347, + "step": 24830 + }, + { + "ce_loss": 0.0637267604470253, + "epoch": 8.282188125416944, + "step": 24830 + }, + { + "distill_loss": 0.16139563918113708, + "epoch": 8.282188125416944, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "ref_ce_loss": 0.07349435240030289, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "loss": 0.36667025089263916, + "step": 24830 + }, + { + "ce_loss": 0.04978648200631142, + "epoch": 8.282188125416944, + "step": 24830 + }, + { + "distill_loss": 0.19708067178726196, + "epoch": 8.282188125416944, + "step": 24830 + }, + { + "epoch": 8.282188125416944, + "ref_ce_loss": 0.06627427786588669, + "step": 24830 + }, + { + "epoch": 8.28552368245497, + "loss": 0.4546, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "grad_norm": 1.1406466960906982, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "learning_rate": 6.0100623071704186e-05, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "loss": 0.551635205745697, + "step": 24840 + }, + { + "ce_loss": 0.06963831186294556, + "epoch": 8.28552368245497, + "step": 24840 + }, + { + "distill_loss": 0.23795580863952637, + "epoch": 8.28552368245497, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "ref_ce_loss": 0.0977717936038971, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "loss": 0.36534714698791504, + "step": 24840 + }, + { + "ce_loss": 0.06684930622577667, + "epoch": 8.28552368245497, + "step": 24840 + }, + { + "distill_loss": 0.17508380115032196, + "epoch": 8.28552368245497, + "step": 24840 + }, + { + "epoch": 8.28552368245497, + "ref_ce_loss": 0.08289922773838043, + "step": 24840 + }, + { + "epoch": 8.288859239492995, + "loss": 0.3985, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "grad_norm": 0.8758309483528137, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "learning_rate": 5.987300700451477e-05, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "loss": 0.2846289277076721, + "step": 24850 + }, + { + "ce_loss": 0.04146531596779823, + "epoch": 8.288859239492995, + "step": 24850 + }, + { + "distill_loss": 0.18570080399513245, + "epoch": 8.288859239492995, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "ref_ce_loss": 0.05710703507065773, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "loss": 0.3122004568576813, + "step": 24850 + }, + { + "ce_loss": 0.022249093279242516, + "epoch": 8.288859239492995, + "step": 24850 + }, + { + "distill_loss": 0.1364220380783081, + "epoch": 8.288859239492995, + "step": 24850 + }, + { + "epoch": 8.288859239492995, + "ref_ce_loss": 0.050004757940769196, + "step": 24850 + }, + { + "epoch": 8.292194796531021, + "loss": 0.3917, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "grad_norm": 1.4198230504989624, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "learning_rate": 5.9645787902125314e-05, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "loss": 0.4237378239631653, + "step": 24860 + }, + { + "ce_loss": 0.053649380803108215, + "epoch": 8.292194796531021, + "step": 24860 + }, + { + "distill_loss": 0.2529350519180298, + "epoch": 8.292194796531021, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "ref_ce_loss": 0.07835126668214798, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "loss": 0.4145413637161255, + "step": 24860 + }, + { + "ce_loss": 0.0244712233543396, + "epoch": 8.292194796531021, + "step": 24860 + }, + { + "distill_loss": 0.16361355781555176, + "epoch": 8.292194796531021, + "step": 24860 + }, + { + "epoch": 8.292194796531021, + "ref_ce_loss": 0.060614414513111115, + "step": 24860 + }, + { + "epoch": 8.295530353569045, + "loss": 0.3863, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "grad_norm": 1.1518481969833374, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "learning_rate": 5.941896602972503e-05, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "loss": 0.30724936723709106, + "step": 24870 + }, + { + "ce_loss": 0.030511852353811264, + "epoch": 8.295530353569045, + "step": 24870 + }, + { + "distill_loss": 0.1536020040512085, + "epoch": 8.295530353569045, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "ref_ce_loss": 0.06268143653869629, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "loss": 0.404881089925766, + "step": 24870 + }, + { + "ce_loss": 0.07836263626813889, + "epoch": 8.295530353569045, + "step": 24870 + }, + { + "distill_loss": 0.23570379614830017, + "epoch": 8.295530353569045, + "step": 24870 + }, + { + "epoch": 8.295530353569045, + "ref_ce_loss": 0.06730661541223526, + "step": 24870 + }, + { + "epoch": 8.298865910607072, + "loss": 0.412, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "grad_norm": 0.9654243588447571, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "learning_rate": 5.91925416520394e-05, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "loss": 0.40532171726226807, + "step": 24880 + }, + { + "ce_loss": 0.06659356504678726, + "epoch": 8.298865910607072, + "step": 24880 + }, + { + "distill_loss": 0.23507378995418549, + "epoch": 8.298865910607072, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "ref_ce_loss": 0.10342125594615936, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "loss": 0.5091399550437927, + "step": 24880 + }, + { + "ce_loss": 0.07355561852455139, + "epoch": 8.298865910607072, + "step": 24880 + }, + { + "distill_loss": 0.22285902500152588, + "epoch": 8.298865910607072, + "step": 24880 + }, + { + "epoch": 8.298865910607072, + "ref_ce_loss": 0.08281935751438141, + "step": 24880 + }, + { + "epoch": 8.302201467645096, + "loss": 0.4575, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "grad_norm": 1.3964868783950806, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "learning_rate": 5.89665150333301e-05, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "loss": 0.7368953227996826, + "step": 24890 + }, + { + "ce_loss": 0.08275753259658813, + "epoch": 8.302201467645096, + "step": 24890 + }, + { + "distill_loss": 0.2730520963668823, + "epoch": 8.302201467645096, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "ref_ce_loss": 0.0967688113451004, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "loss": 0.3053659200668335, + "step": 24890 + }, + { + "ce_loss": 0.053972817957401276, + "epoch": 8.302201467645096, + "step": 24890 + }, + { + "distill_loss": 0.16566911339759827, + "epoch": 8.302201467645096, + "step": 24890 + }, + { + "epoch": 8.302201467645096, + "ref_ce_loss": 0.06530603021383286, + "step": 24890 + }, + { + "epoch": 8.305537024683122, + "loss": 0.4027, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "grad_norm": 1.1332793235778809, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "learning_rate": 5.8740886437394526e-05, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "loss": 0.5145341157913208, + "step": 24900 + }, + { + "ce_loss": 0.06977076828479767, + "epoch": 8.305537024683122, + "step": 24900 + }, + { + "distill_loss": 0.20493197441101074, + "epoch": 8.305537024683122, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "ref_ce_loss": 0.08403408527374268, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "loss": 0.42330917716026306, + "step": 24900 + }, + { + "ce_loss": 0.09833862632513046, + "epoch": 8.305537024683122, + "step": 24900 + }, + { + "distill_loss": 0.2172425538301468, + "epoch": 8.305537024683122, + "step": 24900 + }, + { + "epoch": 8.305537024683122, + "ref_ce_loss": 0.07472098618745804, + "step": 24900 + }, + { + "epoch": 8.308872581721147, + "loss": 0.3979, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "grad_norm": 1.4151992797851562, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "learning_rate": 5.8515656127565445e-05, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "loss": 0.408204585313797, + "step": 24910 + }, + { + "ce_loss": 0.07390382885932922, + "epoch": 8.308872581721147, + "step": 24910 + }, + { + "distill_loss": 0.2057056427001953, + "epoch": 8.308872581721147, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "ref_ce_loss": 0.08743388950824738, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "loss": 0.35966911911964417, + "step": 24910 + }, + { + "ce_loss": 0.05391908437013626, + "epoch": 8.308872581721147, + "step": 24910 + }, + { + "distill_loss": 0.23523716628551483, + "epoch": 8.308872581721147, + "step": 24910 + }, + { + "epoch": 8.308872581721147, + "ref_ce_loss": 0.047440554946660995, + "step": 24910 + }, + { + "epoch": 8.312208138759173, + "loss": 0.4019, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "grad_norm": 1.5864213705062866, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "learning_rate": 5.829082436671085e-05, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "loss": 0.3313003480434418, + "step": 24920 + }, + { + "ce_loss": 0.027458081021904945, + "epoch": 8.312208138759173, + "step": 24920 + }, + { + "distill_loss": 0.1620972454547882, + "epoch": 8.312208138759173, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "ref_ce_loss": 0.052886199206113815, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "loss": 0.4817555844783783, + "step": 24920 + }, + { + "ce_loss": 0.06159020587801933, + "epoch": 8.312208138759173, + "step": 24920 + }, + { + "distill_loss": 0.18925221264362335, + "epoch": 8.312208138759173, + "step": 24920 + }, + { + "epoch": 8.312208138759173, + "ref_ce_loss": 0.0889902412891388, + "step": 24920 + }, + { + "epoch": 8.315543695797198, + "loss": 0.4349, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "grad_norm": 0.8452260494232178, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "learning_rate": 5.806639141723364e-05, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "loss": 0.2972012460231781, + "step": 24930 + }, + { + "ce_loss": 0.07060505449771881, + "epoch": 8.315543695797198, + "step": 24930 + }, + { + "distill_loss": 0.15588009357452393, + "epoch": 8.315543695797198, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "ref_ce_loss": 0.07062255591154099, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "loss": 0.29503345489501953, + "step": 24930 + }, + { + "ce_loss": 0.03660615533590317, + "epoch": 8.315543695797198, + "step": 24930 + }, + { + "distill_loss": 0.1910068839788437, + "epoch": 8.315543695797198, + "step": 24930 + }, + { + "epoch": 8.315543695797198, + "ref_ce_loss": 0.052066173404455185, + "step": 24930 + }, + { + "epoch": 8.318879252835224, + "loss": 0.4054, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "grad_norm": 1.2340115308761597, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "learning_rate": 5.784235754107137e-05, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "loss": 0.3457760810852051, + "step": 24940 + }, + { + "ce_loss": 0.031064603477716446, + "epoch": 8.318879252835224, + "step": 24940 + }, + { + "distill_loss": 0.15679000318050385, + "epoch": 8.318879252835224, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "ref_ce_loss": 0.06945052742958069, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "loss": 0.3431711792945862, + "step": 24940 + }, + { + "ce_loss": 0.03292974457144737, + "epoch": 8.318879252835224, + "step": 24940 + }, + { + "distill_loss": 0.1323884278535843, + "epoch": 8.318879252835224, + "step": 24940 + }, + { + "epoch": 8.318879252835224, + "ref_ce_loss": 0.06366308033466339, + "step": 24940 + }, + { + "epoch": 8.322214809873248, + "loss": 0.3784, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "grad_norm": 3.7198262214660645, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "learning_rate": 5.7618722999695394e-05, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "loss": 0.38841986656188965, + "step": 24950 + }, + { + "ce_loss": 0.08898656815290451, + "epoch": 8.322214809873248, + "step": 24950 + }, + { + "distill_loss": 0.19516724348068237, + "epoch": 8.322214809873248, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "ref_ce_loss": 0.07287117093801498, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "loss": 0.35441359877586365, + "step": 24950 + }, + { + "ce_loss": 0.05294131487607956, + "epoch": 8.322214809873248, + "step": 24950 + }, + { + "distill_loss": 0.2164778709411621, + "epoch": 8.322214809873248, + "step": 24950 + }, + { + "epoch": 8.322214809873248, + "ref_ce_loss": 0.06361507624387741, + "step": 24950 + }, + { + "epoch": 8.325550366911274, + "loss": 0.411, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "grad_norm": 1.1434744596481323, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "learning_rate": 5.7395488054111436e-05, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "loss": 0.2941511571407318, + "step": 24960 + }, + { + "ce_loss": 0.052058592438697815, + "epoch": 8.325550366911274, + "step": 24960 + }, + { + "distill_loss": 0.17279891669750214, + "epoch": 8.325550366911274, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "ref_ce_loss": 0.06911136209964752, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "loss": 0.30970582365989685, + "step": 24960 + }, + { + "ce_loss": 0.04044627025723457, + "epoch": 8.325550366911274, + "step": 24960 + }, + { + "distill_loss": 0.17380879819393158, + "epoch": 8.325550366911274, + "step": 24960 + }, + { + "epoch": 8.325550366911274, + "ref_ce_loss": 0.0723646730184555, + "step": 24960 + }, + { + "epoch": 8.328885923949299, + "loss": 0.3814, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "grad_norm": 4.01048469543457, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "learning_rate": 5.717265296485872e-05, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "loss": 0.4379655122756958, + "step": 24970 + }, + { + "ce_loss": 0.053386520594358444, + "epoch": 8.328885923949299, + "step": 24970 + }, + { + "distill_loss": 0.1718374490737915, + "epoch": 8.328885923949299, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "ref_ce_loss": 0.06681462377309799, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "loss": 0.45380282402038574, + "step": 24970 + }, + { + "ce_loss": 0.08911381661891937, + "epoch": 8.328885923949299, + "step": 24970 + }, + { + "distill_loss": 0.23314177989959717, + "epoch": 8.328885923949299, + "step": 24970 + }, + { + "epoch": 8.328885923949299, + "ref_ce_loss": 0.06985948234796524, + "step": 24970 + }, + { + "epoch": 8.332221480987325, + "loss": 0.3751, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "grad_norm": 1.6863759756088257, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "learning_rate": 5.69502179920097e-05, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "loss": 0.32353562116622925, + "step": 24980 + }, + { + "ce_loss": 0.06093441694974899, + "epoch": 8.332221480987325, + "step": 24980 + }, + { + "distill_loss": 0.15882942080497742, + "epoch": 8.332221480987325, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "ref_ce_loss": 0.07053432613611221, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "loss": 0.4023553431034088, + "step": 24980 + }, + { + "ce_loss": 0.08936289697885513, + "epoch": 8.332221480987325, + "step": 24980 + }, + { + "distill_loss": 0.20839953422546387, + "epoch": 8.332221480987325, + "step": 24980 + }, + { + "epoch": 8.332221480987325, + "ref_ce_loss": 0.07393033057451248, + "step": 24980 + }, + { + "epoch": 8.33555703802535, + "loss": 0.4224, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "grad_norm": 0.9262280464172363, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "learning_rate": 5.672818339517001e-05, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "loss": 0.3177008032798767, + "step": 24990 + }, + { + "ce_loss": 0.06413191556930542, + "epoch": 8.33555703802535, + "step": 24990 + }, + { + "distill_loss": 0.15126436948776245, + "epoch": 8.33555703802535, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "ref_ce_loss": 0.10207774490118027, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "loss": 0.3172096312046051, + "step": 24990 + }, + { + "ce_loss": 0.04232507944107056, + "epoch": 8.33555703802535, + "step": 24990 + }, + { + "distill_loss": 0.19980588555335999, + "epoch": 8.33555703802535, + "step": 24990 + }, + { + "epoch": 8.33555703802535, + "ref_ce_loss": 0.07482916116714478, + "step": 24990 + }, + { + "epoch": 8.338892595063376, + "loss": 0.4413, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "grad_norm": 1.1610908508300781, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "learning_rate": 5.6506549433477865e-05, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "loss": 1.097810983657837, + "step": 25000 + }, + { + "ce_loss": 0.05602821335196495, + "epoch": 8.338892595063376, + "step": 25000 + }, + { + "distill_loss": 0.18375152349472046, + "epoch": 8.338892595063376, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "ref_ce_loss": 0.05691630393266678, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "loss": 0.3631144165992737, + "step": 25000 + }, + { + "ce_loss": 0.06245829910039902, + "epoch": 8.338892595063376, + "step": 25000 + }, + { + "distill_loss": 0.19284473359584808, + "epoch": 8.338892595063376, + "step": 25000 + }, + { + "epoch": 8.338892595063376, + "ref_ce_loss": 0.07752401381731033, + "step": 25000 + }, + { + "epoch": 8.3422281521014, + "loss": 0.4367, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "grad_norm": 1.536965012550354, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "learning_rate": 5.6285316365604037e-05, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "loss": 0.44820597767829895, + "step": 25010 + }, + { + "ce_loss": 0.07226640731096268, + "epoch": 8.3422281521014, + "step": 25010 + }, + { + "distill_loss": 0.20867325365543365, + "epoch": 8.3422281521014, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "ref_ce_loss": 0.06741487234830856, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "loss": 0.3269725739955902, + "step": 25010 + }, + { + "ce_loss": 0.062072884291410446, + "epoch": 8.3422281521014, + "step": 25010 + }, + { + "distill_loss": 0.17925648391246796, + "epoch": 8.3422281521014, + "step": 25010 + }, + { + "epoch": 8.3422281521014, + "ref_ce_loss": 0.08505354076623917, + "step": 25010 + }, + { + "epoch": 8.345563709139427, + "loss": 0.3696, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "grad_norm": 0.9757553935050964, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "learning_rate": 5.6064484449751145e-05, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "loss": 0.5743061304092407, + "step": 25020 + }, + { + "ce_loss": 0.028917063027620316, + "epoch": 8.345563709139427, + "step": 25020 + }, + { + "distill_loss": 0.19040493667125702, + "epoch": 8.345563709139427, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "ref_ce_loss": 0.072723887860775, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "loss": 0.33696094155311584, + "step": 25020 + }, + { + "ce_loss": 0.08223090320825577, + "epoch": 8.345563709139427, + "step": 25020 + }, + { + "distill_loss": 0.18200477957725525, + "epoch": 8.345563709139427, + "step": 25020 + }, + { + "epoch": 8.345563709139427, + "ref_ce_loss": 0.07257626950740814, + "step": 25020 + }, + { + "epoch": 8.348899266177451, + "loss": 0.377, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "grad_norm": 1.1528825759887695, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "learning_rate": 5.584405394365391e-05, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "loss": 0.35527855157852173, + "step": 25030 + }, + { + "ce_loss": 0.07070120424032211, + "epoch": 8.348899266177451, + "step": 25030 + }, + { + "distill_loss": 0.1744517683982849, + "epoch": 8.348899266177451, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "ref_ce_loss": 0.05900336802005768, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "loss": 0.40178489685058594, + "step": 25030 + }, + { + "ce_loss": 0.05492159724235535, + "epoch": 8.348899266177451, + "step": 25030 + }, + { + "distill_loss": 0.21854011714458466, + "epoch": 8.348899266177451, + "step": 25030 + }, + { + "epoch": 8.348899266177451, + "ref_ce_loss": 0.09786759316921234, + "step": 25030 + }, + { + "epoch": 8.352234823215477, + "loss": 0.4241, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "grad_norm": 2.2214083671569824, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "learning_rate": 5.5624025104578404e-05, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "loss": 0.3765566349029541, + "step": 25040 + }, + { + "ce_loss": 0.05312661454081535, + "epoch": 8.352234823215477, + "step": 25040 + }, + { + "distill_loss": 0.2064882069826126, + "epoch": 8.352234823215477, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "ref_ce_loss": 0.09187427163124084, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "loss": 0.4223754107952118, + "step": 25040 + }, + { + "ce_loss": 0.10373418778181076, + "epoch": 8.352234823215477, + "step": 25040 + }, + { + "distill_loss": 0.20074056088924408, + "epoch": 8.352234823215477, + "step": 25040 + }, + { + "epoch": 8.352234823215477, + "ref_ce_loss": 0.09522716701030731, + "step": 25040 + }, + { + "epoch": 8.355570380253502, + "loss": 0.4533, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "grad_norm": 1.3162434101104736, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "learning_rate": 5.540439818932202e-05, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "loss": 0.5700833797454834, + "step": 25050 + }, + { + "ce_loss": 0.07677993178367615, + "epoch": 8.355570380253502, + "step": 25050 + }, + { + "distill_loss": 0.1616164743900299, + "epoch": 8.355570380253502, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "ref_ce_loss": 0.09277770668268204, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "loss": 0.3516484797000885, + "step": 25050 + }, + { + "ce_loss": 0.04538606479763985, + "epoch": 8.355570380253502, + "step": 25050 + }, + { + "distill_loss": 0.18248510360717773, + "epoch": 8.355570380253502, + "step": 25050 + }, + { + "epoch": 8.355570380253502, + "ref_ce_loss": 0.06455279886722565, + "step": 25050 + }, + { + "epoch": 8.358905937291528, + "loss": 0.4638, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "grad_norm": 1.3298295736312866, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "learning_rate": 5.518517345421304e-05, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "loss": 0.4533049464225769, + "step": 25060 + }, + { + "ce_loss": 0.08425446599721909, + "epoch": 8.358905937291528, + "step": 25060 + }, + { + "distill_loss": 0.2121516764163971, + "epoch": 8.358905937291528, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "ref_ce_loss": 0.08103732019662857, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "loss": 0.3355601131916046, + "step": 25060 + }, + { + "ce_loss": 0.07693885266780853, + "epoch": 8.358905937291528, + "step": 25060 + }, + { + "distill_loss": 0.15625596046447754, + "epoch": 8.358905937291528, + "step": 25060 + }, + { + "epoch": 8.358905937291528, + "ref_ce_loss": 0.07718048989772797, + "step": 25060 + }, + { + "epoch": 8.362241494329552, + "loss": 0.3917, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "grad_norm": 1.6331768035888672, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "learning_rate": 5.496635115511017e-05, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "loss": 0.4228060841560364, + "step": 25070 + }, + { + "ce_loss": 0.04212617501616478, + "epoch": 8.362241494329552, + "step": 25070 + }, + { + "distill_loss": 0.18509574234485626, + "epoch": 8.362241494329552, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "ref_ce_loss": 0.07182523608207703, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "loss": 0.39445173740386963, + "step": 25070 + }, + { + "ce_loss": 0.07941533625125885, + "epoch": 8.362241494329552, + "step": 25070 + }, + { + "distill_loss": 0.18916261196136475, + "epoch": 8.362241494329552, + "step": 25070 + }, + { + "epoch": 8.362241494329552, + "ref_ce_loss": 0.07356342673301697, + "step": 25070 + }, + { + "epoch": 8.365577051367579, + "loss": 0.3803, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "grad_norm": 1.9852405786514282, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "learning_rate": 5.474793154740257e-05, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "loss": 0.30684754252433777, + "step": 25080 + }, + { + "ce_loss": 0.04548907279968262, + "epoch": 8.365577051367579, + "step": 25080 + }, + { + "distill_loss": 0.1826765537261963, + "epoch": 8.365577051367579, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "ref_ce_loss": 0.0782376378774643, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "loss": 0.5661678314208984, + "step": 25080 + }, + { + "ce_loss": 0.07862786203622818, + "epoch": 8.365577051367579, + "step": 25080 + }, + { + "distill_loss": 0.2614433169364929, + "epoch": 8.365577051367579, + "step": 25080 + }, + { + "epoch": 8.365577051367579, + "ref_ce_loss": 0.07493744790554047, + "step": 25080 + }, + { + "epoch": 8.368912608405603, + "loss": 0.3811, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "grad_norm": 0.8753249645233154, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "learning_rate": 5.452991488600967e-05, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "loss": 0.40100419521331787, + "step": 25090 + }, + { + "ce_loss": 0.08868025243282318, + "epoch": 8.368912608405603, + "step": 25090 + }, + { + "distill_loss": 0.21525098383426666, + "epoch": 8.368912608405603, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "ref_ce_loss": 0.07010910660028458, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "loss": 0.30055487155914307, + "step": 25090 + }, + { + "ce_loss": 0.03426504135131836, + "epoch": 8.368912608405603, + "step": 25090 + }, + { + "distill_loss": 0.16961617767810822, + "epoch": 8.368912608405603, + "step": 25090 + }, + { + "epoch": 8.368912608405603, + "ref_ce_loss": 0.07461228221654892, + "step": 25090 + }, + { + "epoch": 8.37224816544363, + "loss": 0.3997, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "grad_norm": 1.5080701112747192, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "learning_rate": 5.431230142538018e-05, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "loss": 0.3572412431240082, + "step": 25100 + }, + { + "ce_loss": 0.04778499901294708, + "epoch": 8.37224816544363, + "step": 25100 + }, + { + "distill_loss": 0.18236097693443298, + "epoch": 8.37224816544363, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "ref_ce_loss": 0.09740842133760452, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "loss": 0.334131121635437, + "step": 25100 + }, + { + "ce_loss": 0.04791150614619255, + "epoch": 8.37224816544363, + "step": 25100 + }, + { + "distill_loss": 0.16604946553707123, + "epoch": 8.37224816544363, + "step": 25100 + }, + { + "epoch": 8.37224816544363, + "ref_ce_loss": 0.055136483162641525, + "step": 25100 + }, + { + "epoch": 8.375583722481654, + "loss": 0.3765, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "grad_norm": 1.1067456007003784, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "learning_rate": 5.409509141949243e-05, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "loss": 0.3909205496311188, + "step": 25110 + }, + { + "ce_loss": 0.07044972479343414, + "epoch": 8.375583722481654, + "step": 25110 + }, + { + "distill_loss": 0.16520169377326965, + "epoch": 8.375583722481654, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "ref_ce_loss": 0.09196219593286514, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "loss": 0.4352046549320221, + "step": 25110 + }, + { + "ce_loss": 0.08850784599781036, + "epoch": 8.375583722481654, + "step": 25110 + }, + { + "distill_loss": 0.24338781833648682, + "epoch": 8.375583722481654, + "step": 25110 + }, + { + "epoch": 8.375583722481654, + "ref_ce_loss": 0.0809473991394043, + "step": 25110 + }, + { + "epoch": 8.37891927951968, + "loss": 0.375, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "grad_norm": 2.305718421936035, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "learning_rate": 5.387828512185387e-05, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "loss": 0.3594679832458496, + "step": 25120 + }, + { + "ce_loss": 0.058741405606269836, + "epoch": 8.37891927951968, + "step": 25120 + }, + { + "distill_loss": 0.17034927010536194, + "epoch": 8.37891927951968, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "ref_ce_loss": 0.09841496497392654, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "loss": 0.6309322118759155, + "step": 25120 + }, + { + "ce_loss": 0.0625331699848175, + "epoch": 8.37891927951968, + "step": 25120 + }, + { + "distill_loss": 0.22987818717956543, + "epoch": 8.37891927951968, + "step": 25120 + }, + { + "epoch": 8.37891927951968, + "ref_ce_loss": 0.07687834650278091, + "step": 25120 + }, + { + "epoch": 8.382254836557705, + "loss": 0.3689, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "grad_norm": 1.0419330596923828, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "learning_rate": 5.366188278550093e-05, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "loss": 0.5490450263023376, + "step": 25130 + }, + { + "ce_loss": 0.09026575088500977, + "epoch": 8.382254836557705, + "step": 25130 + }, + { + "distill_loss": 0.27481669187545776, + "epoch": 8.382254836557705, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "ref_ce_loss": 0.11525463312864304, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "loss": 0.42957377433776855, + "step": 25130 + }, + { + "ce_loss": 0.07263614237308502, + "epoch": 8.382254836557705, + "step": 25130 + }, + { + "distill_loss": 0.21777136623859406, + "epoch": 8.382254836557705, + "step": 25130 + }, + { + "epoch": 8.382254836557705, + "ref_ce_loss": 0.08512672781944275, + "step": 25130 + }, + { + "epoch": 8.38559039359573, + "loss": 0.4376, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "grad_norm": 6.886488914489746, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "learning_rate": 5.344588466299825e-05, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "loss": 0.3433779776096344, + "step": 25140 + }, + { + "ce_loss": 0.05504188314080238, + "epoch": 8.38559039359573, + "step": 25140 + }, + { + "distill_loss": 0.2048121690750122, + "epoch": 8.38559039359573, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "ref_ce_loss": 0.058514852076768875, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "loss": 0.31423816084861755, + "step": 25140 + }, + { + "ce_loss": 0.033430345356464386, + "epoch": 8.38559039359573, + "step": 25140 + }, + { + "distill_loss": 0.16572749614715576, + "epoch": 8.38559039359573, + "step": 25140 + }, + { + "epoch": 8.38559039359573, + "ref_ce_loss": 0.08138911426067352, + "step": 25140 + }, + { + "epoch": 8.388925950633755, + "loss": 0.3877, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "grad_norm": 1.519091248512268, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "learning_rate": 5.323029100643888e-05, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "loss": 0.47047603130340576, + "step": 25150 + }, + { + "ce_loss": 0.10079821199178696, + "epoch": 8.388925950633755, + "step": 25150 + }, + { + "distill_loss": 0.2616922855377197, + "epoch": 8.388925950633755, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "ref_ce_loss": 0.07919727265834808, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "loss": 0.37185871601104736, + "step": 25150 + }, + { + "ce_loss": 0.03542047366499901, + "epoch": 8.388925950633755, + "step": 25150 + }, + { + "distill_loss": 0.1679682582616806, + "epoch": 8.388925950633755, + "step": 25150 + }, + { + "epoch": 8.388925950633755, + "ref_ce_loss": 0.05478193610906601, + "step": 25150 + }, + { + "epoch": 8.392261507671781, + "loss": 0.412, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "grad_norm": 1.5826023817062378, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "learning_rate": 5.301510206744391e-05, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "loss": 0.4558910131454468, + "step": 25160 + }, + { + "ce_loss": 0.08379810303449631, + "epoch": 8.392261507671781, + "step": 25160 + }, + { + "distill_loss": 0.26472505927085876, + "epoch": 8.392261507671781, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "ref_ce_loss": 0.08704212307929993, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "loss": 0.331781804561615, + "step": 25160 + }, + { + "ce_loss": 0.05914212018251419, + "epoch": 8.392261507671781, + "step": 25160 + }, + { + "distill_loss": 0.18458394706249237, + "epoch": 8.392261507671781, + "step": 25160 + }, + { + "epoch": 8.392261507671781, + "ref_ce_loss": 0.0879281759262085, + "step": 25160 + }, + { + "epoch": 8.395597064709806, + "loss": 0.4502, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "grad_norm": 1.1389052867889404, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "learning_rate": 5.2800318097161997e-05, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "loss": 0.4669540226459503, + "step": 25170 + }, + { + "ce_loss": 0.08764485269784927, + "epoch": 8.395597064709806, + "step": 25170 + }, + { + "distill_loss": 0.2544141709804535, + "epoch": 8.395597064709806, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "ref_ce_loss": 0.0960792750120163, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "loss": 0.5467636585235596, + "step": 25170 + }, + { + "ce_loss": 0.0679205060005188, + "epoch": 8.395597064709806, + "step": 25170 + }, + { + "distill_loss": 0.18445956707000732, + "epoch": 8.395597064709806, + "step": 25170 + }, + { + "epoch": 8.395597064709806, + "ref_ce_loss": 0.10022291541099548, + "step": 25170 + }, + { + "epoch": 8.398932621747832, + "loss": 0.4217, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "grad_norm": 1.0912070274353027, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "learning_rate": 5.2585939346269055e-05, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "loss": 0.44236764311790466, + "step": 25180 + }, + { + "ce_loss": 0.07659192383289337, + "epoch": 8.398932621747832, + "step": 25180 + }, + { + "distill_loss": 0.27285832166671753, + "epoch": 8.398932621747832, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "ref_ce_loss": 0.09274059534072876, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "loss": 0.3482986092567444, + "step": 25180 + }, + { + "ce_loss": 0.041937630623579025, + "epoch": 8.398932621747832, + "step": 25180 + }, + { + "distill_loss": 0.19385190308094025, + "epoch": 8.398932621747832, + "step": 25180 + }, + { + "epoch": 8.398932621747832, + "ref_ce_loss": 0.07759872823953629, + "step": 25180 + }, + { + "epoch": 8.402268178785857, + "loss": 0.4, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "grad_norm": 1.6916354894638062, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "learning_rate": 5.237196606496806e-05, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "loss": 0.3225415349006653, + "step": 25190 + }, + { + "ce_loss": 0.05989629775285721, + "epoch": 8.402268178785857, + "step": 25190 + }, + { + "distill_loss": 0.18907451629638672, + "epoch": 8.402268178785857, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "ref_ce_loss": 0.07331027090549469, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "loss": 0.5849019289016724, + "step": 25190 + }, + { + "ce_loss": 0.061337001621723175, + "epoch": 8.402268178785857, + "step": 25190 + }, + { + "distill_loss": 0.302169531583786, + "epoch": 8.402268178785857, + "step": 25190 + }, + { + "epoch": 8.402268178785857, + "ref_ce_loss": 0.09328872710466385, + "step": 25190 + }, + { + "epoch": 8.405603735823883, + "loss": 0.4324, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "grad_norm": 1.3808259963989258, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "learning_rate": 5.2158398502989116e-05, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "loss": 0.3421984612941742, + "step": 25200 + }, + { + "ce_loss": 0.07046947628259659, + "epoch": 8.405603735823883, + "step": 25200 + }, + { + "distill_loss": 0.17851309478282928, + "epoch": 8.405603735823883, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "ref_ce_loss": 0.06344214826822281, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "loss": 0.236195906996727, + "step": 25200 + }, + { + "ce_loss": 0.0421786792576313, + "epoch": 8.405603735823883, + "step": 25200 + }, + { + "distill_loss": 0.1386895775794983, + "epoch": 8.405603735823883, + "step": 25200 + }, + { + "epoch": 8.405603735823883, + "ref_ce_loss": 0.055049993097782135, + "step": 25200 + }, + { + "epoch": 8.408939292861907, + "loss": 0.3882, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "grad_norm": 0.9948466420173645, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "learning_rate": 5.194523690958848e-05, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "loss": 0.40159645676612854, + "step": 25210 + }, + { + "ce_loss": 0.053242526948451996, + "epoch": 8.408939292861907, + "step": 25210 + }, + { + "distill_loss": 0.1849006712436676, + "epoch": 8.408939292861907, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "ref_ce_loss": 0.06335292011499405, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "loss": 0.4151378571987152, + "step": 25210 + }, + { + "ce_loss": 0.05086888372898102, + "epoch": 8.408939292861907, + "step": 25210 + }, + { + "distill_loss": 0.17231523990631104, + "epoch": 8.408939292861907, + "step": 25210 + }, + { + "epoch": 8.408939292861907, + "ref_ce_loss": 0.07796960324048996, + "step": 25210 + }, + { + "epoch": 8.412274849899934, + "loss": 0.3849, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "grad_norm": 1.9043771028518677, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "learning_rate": 5.173248153354853e-05, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "loss": 0.43315085768699646, + "step": 25220 + }, + { + "ce_loss": 0.09647293388843536, + "epoch": 8.412274849899934, + "step": 25220 + }, + { + "distill_loss": 0.2354562133550644, + "epoch": 8.412274849899934, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "ref_ce_loss": 0.07499422878026962, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "loss": 0.32602351903915405, + "step": 25220 + }, + { + "ce_loss": 0.029574042186141014, + "epoch": 8.412274849899934, + "step": 25220 + }, + { + "distill_loss": 0.16238999366760254, + "epoch": 8.412274849899934, + "step": 25220 + }, + { + "epoch": 8.412274849899934, + "ref_ce_loss": 0.05871171876788139, + "step": 25220 + }, + { + "epoch": 8.415610406937958, + "loss": 0.3866, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "grad_norm": 1.129930019378662, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "learning_rate": 5.152013262317779e-05, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "loss": 0.4822864532470703, + "step": 25230 + }, + { + "ce_loss": 0.060090694576501846, + "epoch": 8.415610406937958, + "step": 25230 + }, + { + "distill_loss": 0.2543797791004181, + "epoch": 8.415610406937958, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "ref_ce_loss": 0.0755380168557167, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "loss": 0.4784907102584839, + "step": 25230 + }, + { + "ce_loss": 0.05570368841290474, + "epoch": 8.415610406937958, + "step": 25230 + }, + { + "distill_loss": 0.23093335330486298, + "epoch": 8.415610406937958, + "step": 25230 + }, + { + "epoch": 8.415610406937958, + "ref_ce_loss": 0.09662224352359772, + "step": 25230 + }, + { + "epoch": 8.418945963975984, + "loss": 0.3674, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "grad_norm": 1.370429515838623, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "learning_rate": 5.130819042631023e-05, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "loss": 0.3596634864807129, + "step": 25240 + }, + { + "ce_loss": 0.044010650366544724, + "epoch": 8.418945963975984, + "step": 25240 + }, + { + "distill_loss": 0.1983024775981903, + "epoch": 8.418945963975984, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "ref_ce_loss": 0.08885973691940308, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "loss": 0.49388378858566284, + "step": 25240 + }, + { + "ce_loss": 0.07588797807693481, + "epoch": 8.418945963975984, + "step": 25240 + }, + { + "distill_loss": 0.19174428284168243, + "epoch": 8.418945963975984, + "step": 25240 + }, + { + "epoch": 8.418945963975984, + "ref_ce_loss": 0.10179644078016281, + "step": 25240 + }, + { + "epoch": 8.422281521014009, + "loss": 0.4296, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "grad_norm": 1.163691759109497, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "learning_rate": 5.109665519030538e-05, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "loss": 0.3523298501968384, + "step": 25250 + }, + { + "ce_loss": 0.06145976111292839, + "epoch": 8.422281521014009, + "step": 25250 + }, + { + "distill_loss": 0.1971292793750763, + "epoch": 8.422281521014009, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "ref_ce_loss": 0.07176455110311508, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "loss": 0.2701820433139801, + "step": 25250 + }, + { + "ce_loss": 0.02976236678659916, + "epoch": 8.422281521014009, + "step": 25250 + }, + { + "distill_loss": 0.14251725375652313, + "epoch": 8.422281521014009, + "step": 25250 + }, + { + "epoch": 8.422281521014009, + "ref_ce_loss": 0.06620508432388306, + "step": 25250 + }, + { + "epoch": 8.425617078052035, + "loss": 0.3815, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "grad_norm": 0.999703049659729, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "learning_rate": 5.088552716204733e-05, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "loss": 0.2414764016866684, + "step": 25260 + }, + { + "ce_loss": 0.052632272243499756, + "epoch": 8.425617078052035, + "step": 25260 + }, + { + "distill_loss": 0.11714950948953629, + "epoch": 8.425617078052035, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "ref_ce_loss": 0.07144191116094589, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "loss": 0.4351324141025543, + "step": 25260 + }, + { + "ce_loss": 0.07600032538175583, + "epoch": 8.425617078052035, + "step": 25260 + }, + { + "distill_loss": 0.19265899062156677, + "epoch": 8.425617078052035, + "step": 25260 + }, + { + "epoch": 8.425617078052035, + "ref_ce_loss": 0.06840462237596512, + "step": 25260 + }, + { + "epoch": 8.42895263509006, + "loss": 0.4025, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "grad_norm": 1.1116409301757812, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "learning_rate": 5.067480658794539e-05, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "loss": 0.4140152037143707, + "step": 25270 + }, + { + "ce_loss": 0.06640651077032089, + "epoch": 8.42895263509006, + "step": 25270 + }, + { + "distill_loss": 0.21821719408035278, + "epoch": 8.42895263509006, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "ref_ce_loss": 0.06510384380817413, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "loss": 0.3618549406528473, + "step": 25270 + }, + { + "ce_loss": 0.028401054441928864, + "epoch": 8.42895263509006, + "step": 25270 + }, + { + "distill_loss": 0.1603056788444519, + "epoch": 8.42895263509006, + "step": 25270 + }, + { + "epoch": 8.42895263509006, + "ref_ce_loss": 0.061352767050266266, + "step": 25270 + }, + { + "epoch": 8.432288192128086, + "loss": 0.3696, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "grad_norm": 1.4946244955062866, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "learning_rate": 5.046449371393309e-05, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "loss": 0.6138637065887451, + "step": 25280 + }, + { + "ce_loss": 0.06309738010168076, + "epoch": 8.432288192128086, + "step": 25280 + }, + { + "distill_loss": 0.21630330383777618, + "epoch": 8.432288192128086, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "ref_ce_loss": 0.08831988275051117, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "loss": 0.691237211227417, + "step": 25280 + }, + { + "ce_loss": 0.05093845725059509, + "epoch": 8.432288192128086, + "step": 25280 + }, + { + "distill_loss": 0.19174456596374512, + "epoch": 8.432288192128086, + "step": 25280 + }, + { + "epoch": 8.432288192128086, + "ref_ce_loss": 0.05205420404672623, + "step": 25280 + }, + { + "epoch": 8.43562374916611, + "loss": 0.4186, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "grad_norm": 1.1311005353927612, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "learning_rate": 5.0254588785468274e-05, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "loss": 0.27860555052757263, + "step": 25290 + }, + { + "ce_loss": 0.05554281175136566, + "epoch": 8.43562374916611, + "step": 25290 + }, + { + "distill_loss": 0.1556360125541687, + "epoch": 8.43562374916611, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "ref_ce_loss": 0.06723165512084961, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "loss": 0.38482168316841125, + "step": 25290 + }, + { + "ce_loss": 0.044632989913225174, + "epoch": 8.43562374916611, + "step": 25290 + }, + { + "distill_loss": 0.15955740213394165, + "epoch": 8.43562374916611, + "step": 25290 + }, + { + "epoch": 8.43562374916611, + "ref_ce_loss": 0.07114876061677933, + "step": 25290 + }, + { + "epoch": 8.438959306204136, + "loss": 0.3979, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "grad_norm": 1.1892763376235962, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "learning_rate": 5.004509204753238e-05, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "loss": 0.3521316945552826, + "step": 25300 + }, + { + "ce_loss": 0.09186404198408127, + "epoch": 8.438959306204136, + "step": 25300 + }, + { + "distill_loss": 0.16879168152809143, + "epoch": 8.438959306204136, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "ref_ce_loss": 0.09131795912981033, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "loss": 0.45177480578422546, + "step": 25300 + }, + { + "ce_loss": 0.07362983375787735, + "epoch": 8.438959306204136, + "step": 25300 + }, + { + "distill_loss": 0.25475966930389404, + "epoch": 8.438959306204136, + "step": 25300 + }, + { + "epoch": 8.438959306204136, + "ref_ce_loss": 0.09969697892665863, + "step": 25300 + }, + { + "epoch": 8.44229486324216, + "loss": 0.4684, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "grad_norm": 2.0051913261413574, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "learning_rate": 4.983600374463082e-05, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "loss": 0.4478938579559326, + "step": 25310 + }, + { + "ce_loss": 0.09970908612012863, + "epoch": 8.44229486324216, + "step": 25310 + }, + { + "distill_loss": 0.22506193816661835, + "epoch": 8.44229486324216, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "ref_ce_loss": 0.06599080562591553, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "loss": 0.284869521856308, + "step": 25310 + }, + { + "ce_loss": 0.02272508665919304, + "epoch": 8.44229486324216, + "step": 25310 + }, + { + "distill_loss": 0.1496310979127884, + "epoch": 8.44229486324216, + "step": 25310 + }, + { + "epoch": 8.44229486324216, + "ref_ce_loss": 0.05696079134941101, + "step": 25310 + }, + { + "epoch": 8.445630420280187, + "loss": 0.3802, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "grad_norm": 1.4640692472457886, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "learning_rate": 4.962732412079221e-05, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "loss": 0.4311991035938263, + "step": 25320 + }, + { + "ce_loss": 0.08195716887712479, + "epoch": 8.445630420280187, + "step": 25320 + }, + { + "distill_loss": 0.23977629840373993, + "epoch": 8.445630420280187, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "ref_ce_loss": 0.07493677735328674, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "loss": 0.3129364550113678, + "step": 25320 + }, + { + "ce_loss": 0.03413569554686546, + "epoch": 8.445630420280187, + "step": 25320 + }, + { + "distill_loss": 0.19769978523254395, + "epoch": 8.445630420280187, + "step": 25320 + }, + { + "epoch": 8.445630420280187, + "ref_ce_loss": 0.08082103729248047, + "step": 25320 + }, + { + "epoch": 8.448965977318212, + "loss": 0.3941, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "grad_norm": 2.3397746086120605, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "learning_rate": 4.94190534195679e-05, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "loss": 0.3372156620025635, + "step": 25330 + }, + { + "ce_loss": 0.05968444049358368, + "epoch": 8.448965977318212, + "step": 25330 + }, + { + "distill_loss": 0.1731799691915512, + "epoch": 8.448965977318212, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "ref_ce_loss": 0.06552577018737793, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "loss": 1.0824205875396729, + "step": 25330 + }, + { + "ce_loss": 0.13243527710437775, + "epoch": 8.448965977318212, + "step": 25330 + }, + { + "distill_loss": 0.29550617933273315, + "epoch": 8.448965977318212, + "step": 25330 + }, + { + "epoch": 8.448965977318212, + "ref_ce_loss": 0.10851672291755676, + "step": 25330 + }, + { + "epoch": 8.452301534356238, + "loss": 0.4373, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "grad_norm": 1.4478459358215332, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "learning_rate": 4.921119188403234e-05, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "loss": 0.35270383954048157, + "step": 25340 + }, + { + "ce_loss": 0.02878500334918499, + "epoch": 8.452301534356238, + "step": 25340 + }, + { + "distill_loss": 0.15584421157836914, + "epoch": 8.452301534356238, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "ref_ce_loss": 0.057748857885599136, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "loss": 0.3954959213733673, + "step": 25340 + }, + { + "ce_loss": 0.0580030120909214, + "epoch": 8.452301534356238, + "step": 25340 + }, + { + "distill_loss": 0.16948223114013672, + "epoch": 8.452301534356238, + "step": 25340 + }, + { + "epoch": 8.452301534356238, + "ref_ce_loss": 0.06833826750516891, + "step": 25340 + }, + { + "epoch": 8.455637091394262, + "loss": 0.3913, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "grad_norm": 2.2257843017578125, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "learning_rate": 4.900373975678227e-05, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "loss": 0.4083612561225891, + "step": 25350 + }, + { + "ce_loss": 0.09540177881717682, + "epoch": 8.455637091394262, + "step": 25350 + }, + { + "distill_loss": 0.1899162232875824, + "epoch": 8.455637091394262, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "ref_ce_loss": 0.08591882884502411, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "loss": 0.4242921769618988, + "step": 25350 + }, + { + "ce_loss": 0.054686613380908966, + "epoch": 8.455637091394262, + "step": 25350 + }, + { + "distill_loss": 0.1933739185333252, + "epoch": 8.455637091394262, + "step": 25350 + }, + { + "epoch": 8.455637091394262, + "ref_ce_loss": 0.08469750732183456, + "step": 25350 + }, + { + "epoch": 8.458972648432288, + "loss": 0.4048, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "grad_norm": 1.147485375404358, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "learning_rate": 4.8796697279936784e-05, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "loss": 0.4345652461051941, + "step": 25360 + }, + { + "ce_loss": 0.04452275484800339, + "epoch": 8.458972648432288, + "step": 25360 + }, + { + "distill_loss": 0.20094962418079376, + "epoch": 8.458972648432288, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "ref_ce_loss": 0.05491969734430313, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "loss": 0.3355385959148407, + "step": 25360 + }, + { + "ce_loss": 0.041421085596084595, + "epoch": 8.458972648432288, + "step": 25360 + }, + { + "distill_loss": 0.2158370316028595, + "epoch": 8.458972648432288, + "step": 25360 + }, + { + "epoch": 8.458972648432288, + "ref_ce_loss": 0.07798238098621368, + "step": 25360 + }, + { + "epoch": 8.462308205470313, + "loss": 0.4442, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "grad_norm": 1.0965797901153564, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "learning_rate": 4.8590064695136496e-05, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "loss": 0.22678275406360626, + "step": 25370 + }, + { + "ce_loss": 0.018734248355031013, + "epoch": 8.462308205470313, + "step": 25370 + }, + { + "distill_loss": 0.13638636469841003, + "epoch": 8.462308205470313, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "ref_ce_loss": 0.04462753236293793, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "loss": 0.35289931297302246, + "step": 25370 + }, + { + "ce_loss": 0.0470467135310173, + "epoch": 8.462308205470313, + "step": 25370 + }, + { + "distill_loss": 0.17710834741592407, + "epoch": 8.462308205470313, + "step": 25370 + }, + { + "epoch": 8.462308205470313, + "ref_ce_loss": 0.05419645085930824, + "step": 25370 + }, + { + "epoch": 8.46564376250834, + "loss": 0.4331, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "grad_norm": 1.1017558574676514, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "learning_rate": 4.8383842243544e-05, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "loss": 0.5567941665649414, + "step": 25380 + }, + { + "ce_loss": 0.061687298119068146, + "epoch": 8.46564376250834, + "step": 25380 + }, + { + "distill_loss": 0.2658335566520691, + "epoch": 8.46564376250834, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "ref_ce_loss": 0.08968020230531693, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "loss": 0.5644046664237976, + "step": 25380 + }, + { + "ce_loss": 0.08603052794933319, + "epoch": 8.46564376250834, + "step": 25380 + }, + { + "distill_loss": 0.23047250509262085, + "epoch": 8.46564376250834, + "step": 25380 + }, + { + "epoch": 8.46564376250834, + "ref_ce_loss": 0.08169778436422348, + "step": 25380 + }, + { + "epoch": 8.468979319546364, + "loss": 0.4409, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "grad_norm": 1.4107959270477295, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "learning_rate": 4.8178030165843034e-05, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "loss": 0.4246959090232849, + "step": 25390 + }, + { + "ce_loss": 0.07812031358480453, + "epoch": 8.468979319546364, + "step": 25390 + }, + { + "distill_loss": 0.2077268809080124, + "epoch": 8.468979319546364, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "ref_ce_loss": 0.06722721457481384, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "loss": 0.4788954257965088, + "step": 25390 + }, + { + "ce_loss": 0.11047182232141495, + "epoch": 8.468979319546364, + "step": 25390 + }, + { + "distill_loss": 0.2312658578157425, + "epoch": 8.468979319546364, + "step": 25390 + }, + { + "epoch": 8.468979319546364, + "ref_ce_loss": 0.13693365454673767, + "step": 25390 + }, + { + "epoch": 8.47231487658439, + "loss": 0.4272, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "grad_norm": 1.23094642162323, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "learning_rate": 4.7972628702238484e-05, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "loss": 0.34833505749702454, + "step": 25400 + }, + { + "ce_loss": 0.09293520450592041, + "epoch": 8.47231487658439, + "step": 25400 + }, + { + "distill_loss": 0.1796901375055313, + "epoch": 8.47231487658439, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "ref_ce_loss": 0.07550303637981415, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "loss": 0.4009092450141907, + "step": 25400 + }, + { + "ce_loss": 0.05195685103535652, + "epoch": 8.47231487658439, + "step": 25400 + }, + { + "distill_loss": 0.17379993200302124, + "epoch": 8.47231487658439, + "step": 25400 + }, + { + "epoch": 8.47231487658439, + "ref_ce_loss": 0.09474589675664902, + "step": 25400 + }, + { + "epoch": 8.475650433622414, + "loss": 0.3689, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "grad_norm": 1.3589696884155273, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "learning_rate": 4.776763809245597e-05, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "loss": 0.3397957980632782, + "step": 25410 + }, + { + "ce_loss": 0.05533874034881592, + "epoch": 8.475650433622414, + "step": 25410 + }, + { + "distill_loss": 0.1839795559644699, + "epoch": 8.475650433622414, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "ref_ce_loss": 0.06962695717811584, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "loss": 0.4177882671356201, + "step": 25410 + }, + { + "ce_loss": 0.056037165224552155, + "epoch": 8.475650433622414, + "step": 25410 + }, + { + "distill_loss": 0.1790320724248886, + "epoch": 8.475650433622414, + "step": 25410 + }, + { + "epoch": 8.475650433622414, + "ref_ce_loss": 0.07811678200960159, + "step": 25410 + }, + { + "epoch": 8.47898599066044, + "loss": 0.394, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "grad_norm": 0.9545964598655701, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "learning_rate": 4.756305857574157e-05, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "loss": 0.4571429193019867, + "step": 25420 + }, + { + "ce_loss": 0.1148986965417862, + "epoch": 8.47898599066044, + "step": 25420 + }, + { + "distill_loss": 0.23694494366645813, + "epoch": 8.47898599066044, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "ref_ce_loss": 0.0779845267534256, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "loss": 0.37947702407836914, + "step": 25420 + }, + { + "ce_loss": 0.035717274993658066, + "epoch": 8.47898599066044, + "step": 25420 + }, + { + "distill_loss": 0.1585291177034378, + "epoch": 8.47898599066044, + "step": 25420 + }, + { + "epoch": 8.47898599066044, + "ref_ce_loss": 0.08000072836875916, + "step": 25420 + }, + { + "epoch": 8.482321547698465, + "loss": 0.3691, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "grad_norm": 1.3420974016189575, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "learning_rate": 4.735889039086163e-05, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "loss": 0.4631466865539551, + "step": 25430 + }, + { + "ce_loss": 0.11675658822059631, + "epoch": 8.482321547698465, + "step": 25430 + }, + { + "distill_loss": 0.2317403107881546, + "epoch": 8.482321547698465, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "ref_ce_loss": 0.08134746551513672, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "loss": 0.37111377716064453, + "step": 25430 + }, + { + "ce_loss": 0.08108817040920258, + "epoch": 8.482321547698465, + "step": 25430 + }, + { + "distill_loss": 0.19684088230133057, + "epoch": 8.482321547698465, + "step": 25430 + }, + { + "epoch": 8.482321547698465, + "ref_ce_loss": 0.07465078681707382, + "step": 25430 + }, + { + "epoch": 8.485657104736491, + "loss": 0.4052, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "grad_norm": 1.8001788854599, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "learning_rate": 4.715513377610239e-05, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "loss": 0.42932698130607605, + "step": 25440 + }, + { + "ce_loss": 0.061111774295568466, + "epoch": 8.485657104736491, + "step": 25440 + }, + { + "distill_loss": 0.2266315519809723, + "epoch": 8.485657104736491, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "ref_ce_loss": 0.06741586327552795, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "loss": 0.30838507413864136, + "step": 25440 + }, + { + "ce_loss": 0.05005405843257904, + "epoch": 8.485657104736491, + "step": 25440 + }, + { + "distill_loss": 0.1564617156982422, + "epoch": 8.485657104736491, + "step": 25440 + }, + { + "epoch": 8.485657104736491, + "ref_ce_loss": 0.07379646599292755, + "step": 25440 + }, + { + "epoch": 8.488992661774516, + "loss": 0.3964, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "grad_norm": 0.9869126677513123, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "learning_rate": 4.695178896926966e-05, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "loss": 0.4669387936592102, + "step": 25450 + }, + { + "ce_loss": 0.11110807955265045, + "epoch": 8.488992661774516, + "step": 25450 + }, + { + "distill_loss": 0.23916535079479218, + "epoch": 8.488992661774516, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "ref_ce_loss": 0.07803558558225632, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "loss": 0.3536241054534912, + "step": 25450 + }, + { + "ce_loss": 0.09098926186561584, + "epoch": 8.488992661774516, + "step": 25450 + }, + { + "distill_loss": 0.17528116703033447, + "epoch": 8.488992661774516, + "step": 25450 + }, + { + "epoch": 8.488992661774516, + "ref_ce_loss": 0.08723273873329163, + "step": 25450 + }, + { + "epoch": 8.492328218812542, + "loss": 0.4308, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "grad_norm": 1.1579896211624146, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "learning_rate": 4.674885620768872e-05, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "loss": 0.3379400670528412, + "step": 25460 + }, + { + "ce_loss": 0.057282764464616776, + "epoch": 8.492328218812542, + "step": 25460 + }, + { + "distill_loss": 0.19009092450141907, + "epoch": 8.492328218812542, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "ref_ce_loss": 0.06612122058868408, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "loss": 0.4416944980621338, + "step": 25460 + }, + { + "ce_loss": 0.08713212609291077, + "epoch": 8.492328218812542, + "step": 25460 + }, + { + "distill_loss": 0.22129622101783752, + "epoch": 8.492328218812542, + "step": 25460 + }, + { + "epoch": 8.492328218812542, + "ref_ce_loss": 0.09360598027706146, + "step": 25460 + }, + { + "epoch": 8.495663775850566, + "loss": 0.4013, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "grad_norm": 1.344419002532959, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "learning_rate": 4.654633572820402e-05, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "loss": 0.4243669807910919, + "step": 25470 + }, + { + "ce_loss": 0.07619713246822357, + "epoch": 8.495663775850566, + "step": 25470 + }, + { + "distill_loss": 0.19245898723602295, + "epoch": 8.495663775850566, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "ref_ce_loss": 0.0890323594212532, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "loss": 0.39107850193977356, + "step": 25470 + }, + { + "ce_loss": 0.06887102127075195, + "epoch": 8.495663775850566, + "step": 25470 + }, + { + "distill_loss": 0.18017184734344482, + "epoch": 8.495663775850566, + "step": 25470 + }, + { + "epoch": 8.495663775850566, + "ref_ce_loss": 0.08554864674806595, + "step": 25470 + }, + { + "epoch": 8.498999332888593, + "loss": 0.4082, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "grad_norm": 1.3751683235168457, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "learning_rate": 4.634422776717879e-05, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "loss": 0.3181149363517761, + "step": 25480 + }, + { + "ce_loss": 0.052022214978933334, + "epoch": 8.498999332888593, + "step": 25480 + }, + { + "distill_loss": 0.18051841855049133, + "epoch": 8.498999332888593, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "ref_ce_loss": 0.08533704280853271, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "loss": 0.47193965315818787, + "step": 25480 + }, + { + "ce_loss": 0.024297937750816345, + "epoch": 8.498999332888593, + "step": 25480 + }, + { + "distill_loss": 0.2265244424343109, + "epoch": 8.498999332888593, + "step": 25480 + }, + { + "epoch": 8.498999332888593, + "ref_ce_loss": 0.05670903995633125, + "step": 25480 + }, + { + "epoch": 8.502334889926617, + "loss": 0.3771, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "grad_norm": 1.2867672443389893, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "learning_rate": 4.614253256049459e-05, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "loss": 0.33566558361053467, + "step": 25490 + }, + { + "ce_loss": 0.06634427607059479, + "epoch": 8.502334889926617, + "step": 25490 + }, + { + "distill_loss": 0.16398048400878906, + "epoch": 8.502334889926617, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "ref_ce_loss": 0.07367944717407227, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "loss": 0.5042216777801514, + "step": 25490 + }, + { + "ce_loss": 0.05847841128706932, + "epoch": 8.502334889926617, + "step": 25490 + }, + { + "distill_loss": 0.1794266402721405, + "epoch": 8.502334889926617, + "step": 25490 + }, + { + "epoch": 8.502334889926617, + "ref_ce_loss": 0.08029765635728836, + "step": 25490 + }, + { + "epoch": 8.505670446964643, + "loss": 0.4017, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "grad_norm": 1.5184662342071533, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "learning_rate": 4.5941250343551546e-05, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "loss": 0.39854538440704346, + "step": 25500 + }, + { + "ce_loss": 0.06465799361467361, + "epoch": 8.505670446964643, + "step": 25500 + }, + { + "distill_loss": 0.20663900673389435, + "epoch": 8.505670446964643, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "ref_ce_loss": 0.06514228135347366, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "loss": 0.38145771622657776, + "step": 25500 + }, + { + "ce_loss": 0.040179524570703506, + "epoch": 8.505670446964643, + "step": 25500 + }, + { + "distill_loss": 0.22532419860363007, + "epoch": 8.505670446964643, + "step": 25500 + }, + { + "epoch": 8.505670446964643, + "ref_ce_loss": 0.0818011611700058, + "step": 25500 + }, + { + "epoch": 8.509006004002668, + "loss": 0.3792, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "grad_norm": 0.836586594581604, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "learning_rate": 4.574038135126766e-05, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "loss": 0.3313855826854706, + "step": 25510 + }, + { + "ce_loss": 0.04506858065724373, + "epoch": 8.509006004002668, + "step": 25510 + }, + { + "distill_loss": 0.18170565366744995, + "epoch": 8.509006004002668, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "ref_ce_loss": 0.08849178999662399, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "loss": 0.44603872299194336, + "step": 25510 + }, + { + "ce_loss": 0.034600429236888885, + "epoch": 8.509006004002668, + "step": 25510 + }, + { + "distill_loss": 0.13978897035121918, + "epoch": 8.509006004002668, + "step": 25510 + }, + { + "epoch": 8.509006004002668, + "ref_ce_loss": 0.06843680888414383, + "step": 25510 + }, + { + "epoch": 8.512341561040694, + "loss": 0.4108, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "grad_norm": 2.0533783435821533, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "learning_rate": 4.5539925818078646e-05, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "loss": 0.31537896394729614, + "step": 25520 + }, + { + "ce_loss": 0.0696769580245018, + "epoch": 8.512341561040694, + "step": 25520 + }, + { + "distill_loss": 0.1834687441587448, + "epoch": 8.512341561040694, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "ref_ce_loss": 0.04670481011271477, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "loss": 0.9505656957626343, + "step": 25520 + }, + { + "ce_loss": 0.1394726186990738, + "epoch": 8.512341561040694, + "step": 25520 + }, + { + "distill_loss": 0.2532227635383606, + "epoch": 8.512341561040694, + "step": 25520 + }, + { + "epoch": 8.512341561040694, + "ref_ce_loss": 0.07898080348968506, + "step": 25520 + }, + { + "epoch": 8.515677118078719, + "loss": 0.4574, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "grad_norm": 1.2700072526931763, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "learning_rate": 4.533988397793767e-05, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "loss": 0.4377422034740448, + "step": 25530 + }, + { + "ce_loss": 0.03457576036453247, + "epoch": 8.515677118078719, + "step": 25530 + }, + { + "distill_loss": 0.2131364643573761, + "epoch": 8.515677118078719, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "ref_ce_loss": 0.09452904015779495, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "loss": 0.2893883287906647, + "step": 25530 + }, + { + "ce_loss": 0.03002668172121048, + "epoch": 8.515677118078719, + "step": 25530 + }, + { + "distill_loss": 0.1513487845659256, + "epoch": 8.515677118078719, + "step": 25530 + }, + { + "epoch": 8.515677118078719, + "ref_ce_loss": 0.10772596299648285, + "step": 25530 + }, + { + "epoch": 8.519012675116745, + "loss": 0.4143, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "grad_norm": 1.567345142364502, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "learning_rate": 4.5140256064315136e-05, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "loss": 0.48999351263046265, + "step": 25540 + }, + { + "ce_loss": 0.07218960672616959, + "epoch": 8.519012675116745, + "step": 25540 + }, + { + "distill_loss": 0.16818466782569885, + "epoch": 8.519012675116745, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "ref_ce_loss": 0.09462545812129974, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "loss": 0.4208908677101135, + "step": 25540 + }, + { + "ce_loss": 0.07465120404958725, + "epoch": 8.519012675116745, + "step": 25540 + }, + { + "distill_loss": 0.1858339011669159, + "epoch": 8.519012675116745, + "step": 25540 + }, + { + "epoch": 8.519012675116745, + "ref_ce_loss": 0.11447934061288834, + "step": 25540 + }, + { + "epoch": 8.52234823215477, + "loss": 0.4061, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "grad_norm": 1.4897160530090332, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "learning_rate": 4.494104231019822e-05, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "loss": 0.5165577530860901, + "step": 25550 + }, + { + "ce_loss": 0.08921578526496887, + "epoch": 8.52234823215477, + "step": 25550 + }, + { + "distill_loss": 0.1780136078596115, + "epoch": 8.52234823215477, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "ref_ce_loss": 0.07086190581321716, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "loss": 0.49777376651763916, + "step": 25550 + }, + { + "ce_loss": 0.07921870797872543, + "epoch": 8.52234823215477, + "step": 25550 + }, + { + "distill_loss": 0.22613626718521118, + "epoch": 8.52234823215477, + "step": 25550 + }, + { + "epoch": 8.52234823215477, + "ref_ce_loss": 0.09942245483398438, + "step": 25550 + }, + { + "epoch": 8.525683789192795, + "loss": 0.4983, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "grad_norm": 1.7442575693130493, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "learning_rate": 4.474224294809095e-05, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "loss": 0.3476119637489319, + "step": 25560 + }, + { + "ce_loss": 0.07392597943544388, + "epoch": 8.525683789192795, + "step": 25560 + }, + { + "distill_loss": 0.18355512619018555, + "epoch": 8.525683789192795, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "ref_ce_loss": 0.07576129585504532, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "loss": 0.29269444942474365, + "step": 25560 + }, + { + "ce_loss": 0.03631259873509407, + "epoch": 8.525683789192795, + "step": 25560 + }, + { + "distill_loss": 0.17361724376678467, + "epoch": 8.525683789192795, + "step": 25560 + }, + { + "epoch": 8.525683789192795, + "ref_ce_loss": 0.08257342129945755, + "step": 25560 + }, + { + "epoch": 8.52901934623082, + "loss": 0.4647, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "grad_norm": 1.6793243885040283, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "learning_rate": 4.4543858210013414e-05, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "loss": 0.4669928252696991, + "step": 25570 + }, + { + "ce_loss": 0.03943252190947533, + "epoch": 8.52901934623082, + "step": 25570 + }, + { + "distill_loss": 0.19234731793403625, + "epoch": 8.52901934623082, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "ref_ce_loss": 0.09062094986438751, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "loss": 0.3977088928222656, + "step": 25570 + }, + { + "ce_loss": 0.061139557510614395, + "epoch": 8.52901934623082, + "step": 25570 + }, + { + "distill_loss": 0.1872185468673706, + "epoch": 8.52901934623082, + "step": 25570 + }, + { + "epoch": 8.52901934623082, + "ref_ce_loss": 0.06907083839178085, + "step": 25570 + }, + { + "epoch": 8.532354903268846, + "loss": 0.3617, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "grad_norm": 1.6442162990570068, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "learning_rate": 4.434588832750195e-05, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "loss": 0.4794367551803589, + "step": 25580 + }, + { + "ce_loss": 0.08833171427249908, + "epoch": 8.532354903268846, + "step": 25580 + }, + { + "distill_loss": 0.22108300030231476, + "epoch": 8.532354903268846, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "ref_ce_loss": 0.0778263658285141, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "loss": 0.43841928243637085, + "step": 25580 + }, + { + "ce_loss": 0.07592614740133286, + "epoch": 8.532354903268846, + "step": 25580 + }, + { + "distill_loss": 0.209527850151062, + "epoch": 8.532354903268846, + "step": 25580 + }, + { + "epoch": 8.532354903268846, + "ref_ce_loss": 0.07981114089488983, + "step": 25580 + }, + { + "epoch": 8.53569046030687, + "loss": 0.4094, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "grad_norm": 1.0511441230773926, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "learning_rate": 4.414833353160885e-05, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "loss": 0.3373606204986572, + "step": 25590 + }, + { + "ce_loss": 0.0720352753996849, + "epoch": 8.53569046030687, + "step": 25590 + }, + { + "distill_loss": 0.16815629601478577, + "epoch": 8.53569046030687, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "ref_ce_loss": 0.06618942320346832, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "loss": 0.2670990526676178, + "step": 25590 + }, + { + "ce_loss": 0.025431491434574127, + "epoch": 8.53569046030687, + "step": 25590 + }, + { + "distill_loss": 0.14715948700904846, + "epoch": 8.53569046030687, + "step": 25590 + }, + { + "epoch": 8.53569046030687, + "ref_ce_loss": 0.09422583132982254, + "step": 25590 + }, + { + "epoch": 8.539026017344897, + "loss": 0.3929, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "grad_norm": 1.3763742446899414, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "learning_rate": 4.395119405290178e-05, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "loss": 0.3327689468860626, + "step": 25600 + }, + { + "ce_loss": 0.07418499886989594, + "epoch": 8.539026017344897, + "step": 25600 + }, + { + "distill_loss": 0.17308631539344788, + "epoch": 8.539026017344897, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "ref_ce_loss": 0.06540984660387039, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "loss": 0.38300207257270813, + "step": 25600 + }, + { + "ce_loss": 0.09874434769153595, + "epoch": 8.539026017344897, + "step": 25600 + }, + { + "distill_loss": 0.17336177825927734, + "epoch": 8.539026017344897, + "step": 25600 + }, + { + "epoch": 8.539026017344897, + "ref_ce_loss": 0.07843245565891266, + "step": 25600 + }, + { + "epoch": 8.542361574382921, + "loss": 0.3986, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "grad_norm": 1.043558955192566, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "learning_rate": 4.375447012146361e-05, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "loss": 0.3494413197040558, + "step": 25610 + }, + { + "ce_loss": 0.0686168521642685, + "epoch": 8.542361574382921, + "step": 25610 + }, + { + "distill_loss": 0.19644740223884583, + "epoch": 8.542361574382921, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "ref_ce_loss": 0.06413035839796066, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "loss": 0.2559710741043091, + "step": 25610 + }, + { + "ce_loss": 0.043748125433921814, + "epoch": 8.542361574382921, + "step": 25610 + }, + { + "distill_loss": 0.13352961838245392, + "epoch": 8.542361574382921, + "step": 25610 + }, + { + "epoch": 8.542361574382921, + "ref_ce_loss": 0.05490174889564514, + "step": 25610 + }, + { + "epoch": 8.545697131420948, + "loss": 0.3927, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "grad_norm": 0.8407578468322754, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "learning_rate": 4.355816196689242e-05, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "loss": 0.4915314316749573, + "step": 25620 + }, + { + "ce_loss": 0.08458271622657776, + "epoch": 8.545697131420948, + "step": 25620 + }, + { + "distill_loss": 0.19072310626506805, + "epoch": 8.545697131420948, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "ref_ce_loss": 0.10890834778547287, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "loss": 0.386917382478714, + "step": 25620 + }, + { + "ce_loss": 0.06721224635839462, + "epoch": 8.545697131420948, + "step": 25620 + }, + { + "distill_loss": 0.19537372887134552, + "epoch": 8.545697131420948, + "step": 25620 + }, + { + "epoch": 8.545697131420948, + "ref_ce_loss": 0.06054548919200897, + "step": 25620 + }, + { + "epoch": 8.549032688458972, + "loss": 0.3957, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "grad_norm": 1.364972710609436, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "learning_rate": 4.336226981830094e-05, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "loss": 0.4775812029838562, + "step": 25630 + }, + { + "ce_loss": 0.06949009001255035, + "epoch": 8.549032688458972, + "step": 25630 + }, + { + "distill_loss": 0.20447465777397156, + "epoch": 8.549032688458972, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "ref_ce_loss": 0.061440128833055496, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "loss": 0.37345072627067566, + "step": 25630 + }, + { + "ce_loss": 0.06991104036569595, + "epoch": 8.549032688458972, + "step": 25630 + }, + { + "distill_loss": 0.15843914449214935, + "epoch": 8.549032688458972, + "step": 25630 + }, + { + "epoch": 8.549032688458972, + "ref_ce_loss": 0.09249599277973175, + "step": 25630 + }, + { + "epoch": 8.552368245496998, + "loss": 0.402, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "grad_norm": 1.4104706048965454, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "learning_rate": 4.316679390431637e-05, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "loss": 0.4269067347049713, + "step": 25640 + }, + { + "ce_loss": 0.05708153545856476, + "epoch": 8.552368245496998, + "step": 25640 + }, + { + "distill_loss": 0.18109223246574402, + "epoch": 8.552368245496998, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "ref_ce_loss": 0.07652582973241806, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "loss": 0.2876511514186859, + "step": 25640 + }, + { + "ce_loss": 0.05271648243069649, + "epoch": 8.552368245496998, + "step": 25640 + }, + { + "distill_loss": 0.14108356833457947, + "epoch": 8.552368245496998, + "step": 25640 + }, + { + "epoch": 8.552368245496998, + "ref_ce_loss": 0.06557846069335938, + "step": 25640 + }, + { + "epoch": 8.555703802535023, + "loss": 0.4044, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "grad_norm": 1.0042786598205566, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "learning_rate": 4.297173445308018e-05, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "loss": 0.4267001748085022, + "step": 25650 + }, + { + "ce_loss": 0.06814425438642502, + "epoch": 8.555703802535023, + "step": 25650 + }, + { + "distill_loss": 0.2259957194328308, + "epoch": 8.555703802535023, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "ref_ce_loss": 0.10608452558517456, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "loss": 0.399760901927948, + "step": 25650 + }, + { + "ce_loss": 0.05207909271121025, + "epoch": 8.555703802535023, + "step": 25650 + }, + { + "distill_loss": 0.17557072639465332, + "epoch": 8.555703802535023, + "step": 25650 + }, + { + "epoch": 8.555703802535023, + "ref_ce_loss": 0.08423539251089096, + "step": 25650 + }, + { + "epoch": 8.559039359573049, + "loss": 0.3729, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "grad_norm": 1.627666711807251, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "learning_rate": 4.277709169224773e-05, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "loss": 0.4594741463661194, + "step": 25660 + }, + { + "ce_loss": 0.03643737733364105, + "epoch": 8.559039359573049, + "step": 25660 + }, + { + "distill_loss": 0.18254266679286957, + "epoch": 8.559039359573049, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "ref_ce_loss": 0.0540667325258255, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "loss": 0.32987087965011597, + "step": 25660 + }, + { + "ce_loss": 0.03342566639184952, + "epoch": 8.559039359573049, + "step": 25660 + }, + { + "distill_loss": 0.17751745879650116, + "epoch": 8.559039359573049, + "step": 25660 + }, + { + "epoch": 8.559039359573049, + "ref_ce_loss": 0.0852920189499855, + "step": 25660 + }, + { + "epoch": 8.562374916611073, + "loss": 0.4275, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "grad_norm": 1.3796634674072266, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "learning_rate": 4.2582865848988095e-05, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "loss": 0.43835440278053284, + "step": 25670 + }, + { + "ce_loss": 0.09459662437438965, + "epoch": 8.562374916611073, + "step": 25670 + }, + { + "distill_loss": 0.23603783547878265, + "epoch": 8.562374916611073, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "ref_ce_loss": 0.0750318244099617, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "loss": 0.2934839427471161, + "step": 25670 + }, + { + "ce_loss": 0.039706867188215256, + "epoch": 8.562374916611073, + "step": 25670 + }, + { + "distill_loss": 0.14353084564208984, + "epoch": 8.562374916611073, + "step": 25670 + }, + { + "epoch": 8.562374916611073, + "ref_ce_loss": 0.06643503159284592, + "step": 25670 + }, + { + "epoch": 8.5657104736491, + "loss": 0.4196, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "grad_norm": 0.9201450347900391, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "learning_rate": 4.238905714998365e-05, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "loss": 0.3691575229167938, + "step": 25680 + }, + { + "ce_loss": 0.0561094731092453, + "epoch": 8.5657104736491, + "step": 25680 + }, + { + "distill_loss": 0.17040081322193146, + "epoch": 8.5657104736491, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "ref_ce_loss": 0.07381340861320496, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "loss": 0.38116931915283203, + "step": 25680 + }, + { + "ce_loss": 0.07512297481298447, + "epoch": 8.5657104736491, + "step": 25680 + }, + { + "distill_loss": 0.23427975177764893, + "epoch": 8.5657104736491, + "step": 25680 + }, + { + "epoch": 8.5657104736491, + "ref_ce_loss": 0.07131833583116531, + "step": 25680 + }, + { + "epoch": 8.569046030687124, + "loss": 0.4138, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "grad_norm": 1.5714385509490967, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "learning_rate": 4.219566582143002e-05, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "loss": 0.34663069248199463, + "step": 25690 + }, + { + "ce_loss": 0.05181189998984337, + "epoch": 8.569046030687124, + "step": 25690 + }, + { + "distill_loss": 0.18649733066558838, + "epoch": 8.569046030687124, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "ref_ce_loss": 0.08270205557346344, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "loss": 0.9217813611030579, + "step": 25690 + }, + { + "ce_loss": 0.053318414837121964, + "epoch": 8.569046030687124, + "step": 25690 + }, + { + "distill_loss": 0.20686563849449158, + "epoch": 8.569046030687124, + "step": 25690 + }, + { + "epoch": 8.569046030687124, + "ref_ce_loss": 0.08267318457365036, + "step": 25690 + }, + { + "epoch": 8.57238158772515, + "loss": 0.4133, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "grad_norm": 1.0986888408660889, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "learning_rate": 4.200269208903569e-05, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "loss": 0.4715734124183655, + "step": 25700 + }, + { + "ce_loss": 0.09321790933609009, + "epoch": 8.57238158772515, + "step": 25700 + }, + { + "distill_loss": 0.2323947548866272, + "epoch": 8.57238158772515, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "ref_ce_loss": 0.06556542962789536, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "loss": 0.2592449486255646, + "step": 25700 + }, + { + "ce_loss": 0.02691858820617199, + "epoch": 8.57238158772515, + "step": 25700 + }, + { + "distill_loss": 0.16078926622867584, + "epoch": 8.57238158772515, + "step": 25700 + }, + { + "epoch": 8.57238158772515, + "ref_ce_loss": 0.04746861010789871, + "step": 25700 + }, + { + "epoch": 8.575717144763175, + "loss": 0.4156, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "grad_norm": 3.1051816940307617, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "learning_rate": 4.181013617802192e-05, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "loss": 0.5697551965713501, + "step": 25710 + }, + { + "ce_loss": 0.07349631935358047, + "epoch": 8.575717144763175, + "step": 25710 + }, + { + "distill_loss": 0.2376619577407837, + "epoch": 8.575717144763175, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "ref_ce_loss": 0.07407592982053757, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "loss": 0.354321151971817, + "step": 25710 + }, + { + "ce_loss": 0.058924220502376556, + "epoch": 8.575717144763175, + "step": 25710 + }, + { + "distill_loss": 0.20981810986995697, + "epoch": 8.575717144763175, + "step": 25710 + }, + { + "epoch": 8.575717144763175, + "ref_ce_loss": 0.08513392508029938, + "step": 25710 + }, + { + "epoch": 8.579052701801201, + "loss": 0.4023, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "grad_norm": 1.033412218093872, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "learning_rate": 4.1617998313121966e-05, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "loss": 0.4880424439907074, + "step": 25720 + }, + { + "ce_loss": 0.07538829743862152, + "epoch": 8.579052701801201, + "step": 25720 + }, + { + "distill_loss": 0.2588784992694855, + "epoch": 8.579052701801201, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "ref_ce_loss": 0.06673355400562286, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "loss": 0.4788103699684143, + "step": 25720 + }, + { + "ce_loss": 0.06297007948160172, + "epoch": 8.579052701801201, + "step": 25720 + }, + { + "distill_loss": 0.20087918639183044, + "epoch": 8.579052701801201, + "step": 25720 + }, + { + "epoch": 8.579052701801201, + "ref_ce_loss": 0.07254141569137573, + "step": 25720 + }, + { + "epoch": 8.582388258839226, + "loss": 0.3982, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "grad_norm": 2.35874342918396, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "learning_rate": 4.1426278718581424e-05, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "loss": 0.3429741859436035, + "step": 25730 + }, + { + "ce_loss": 0.059212736785411835, + "epoch": 8.582388258839226, + "step": 25730 + }, + { + "distill_loss": 0.1997862607240677, + "epoch": 8.582388258839226, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "ref_ce_loss": 0.06289255619049072, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "loss": 0.2678738534450531, + "step": 25730 + }, + { + "ce_loss": 0.03431813791394234, + "epoch": 8.582388258839226, + "step": 25730 + }, + { + "distill_loss": 0.1464090496301651, + "epoch": 8.582388258839226, + "step": 25730 + }, + { + "epoch": 8.582388258839226, + "ref_ce_loss": 0.06843450665473938, + "step": 25730 + }, + { + "epoch": 8.585723815877252, + "loss": 0.4033, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "grad_norm": 1.263071894645691, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "learning_rate": 4.123497761815776e-05, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "loss": 0.44260334968566895, + "step": 25740 + }, + { + "ce_loss": 0.08691255003213882, + "epoch": 8.585723815877252, + "step": 25740 + }, + { + "distill_loss": 0.2279985100030899, + "epoch": 8.585723815877252, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "ref_ce_loss": 0.0896274745464325, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "loss": 0.4468820095062256, + "step": 25740 + }, + { + "ce_loss": 0.03885522857308388, + "epoch": 8.585723815877252, + "step": 25740 + }, + { + "distill_loss": 0.14059850573539734, + "epoch": 8.585723815877252, + "step": 25740 + }, + { + "epoch": 8.585723815877252, + "ref_ce_loss": 0.06513622403144836, + "step": 25740 + }, + { + "epoch": 8.589059372915276, + "loss": 0.3826, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "grad_norm": 2.3015244007110596, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "learning_rate": 4.1044095235120004e-05, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "loss": 0.5193451642990112, + "step": 25750 + }, + { + "ce_loss": 0.04930582642555237, + "epoch": 8.589059372915276, + "step": 25750 + }, + { + "distill_loss": 0.19719862937927246, + "epoch": 8.589059372915276, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "ref_ce_loss": 0.07023698091506958, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "loss": 0.40798452496528625, + "step": 25750 + }, + { + "ce_loss": 0.08019145578145981, + "epoch": 8.589059372915276, + "step": 25750 + }, + { + "distill_loss": 0.20547153055667877, + "epoch": 8.589059372915276, + "step": 25750 + }, + { + "epoch": 8.589059372915276, + "ref_ce_loss": 0.08603661507368088, + "step": 25750 + }, + { + "epoch": 8.592394929953302, + "loss": 0.3662, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "grad_norm": 1.2376877069473267, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "learning_rate": 4.085363179224832e-05, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "loss": 0.4375184178352356, + "step": 25760 + }, + { + "ce_loss": 0.11429724097251892, + "epoch": 8.592394929953302, + "step": 25760 + }, + { + "distill_loss": 0.23312397301197052, + "epoch": 8.592394929953302, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "ref_ce_loss": 0.08966071158647537, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "loss": 0.4259558618068695, + "step": 25760 + }, + { + "ce_loss": 0.09264373034238815, + "epoch": 8.592394929953302, + "step": 25760 + }, + { + "distill_loss": 0.19215357303619385, + "epoch": 8.592394929953302, + "step": 25760 + }, + { + "epoch": 8.592394929953302, + "ref_ce_loss": 0.06445678323507309, + "step": 25760 + }, + { + "epoch": 8.595730486991327, + "loss": 0.4045, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "grad_norm": 1.1412529945373535, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "learning_rate": 4.06635875118341e-05, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "loss": 0.28406891226768494, + "step": 25770 + }, + { + "ce_loss": 0.05148237571120262, + "epoch": 8.595730486991327, + "step": 25770 + }, + { + "distill_loss": 0.17395122349262238, + "epoch": 8.595730486991327, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "ref_ce_loss": 0.03983690217137337, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "loss": 0.3416021764278412, + "step": 25770 + }, + { + "ce_loss": 0.08922668546438217, + "epoch": 8.595730486991327, + "step": 25770 + }, + { + "distill_loss": 0.17985227704048157, + "epoch": 8.595730486991327, + "step": 25770 + }, + { + "epoch": 8.595730486991327, + "ref_ce_loss": 0.056731123477220535, + "step": 25770 + }, + { + "epoch": 8.599066044029353, + "loss": 0.462, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "grad_norm": 1.3721615076065063, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "learning_rate": 4.047396261567942e-05, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "loss": 0.543890118598938, + "step": 25780 + }, + { + "ce_loss": 0.05241483822464943, + "epoch": 8.599066044029353, + "step": 25780 + }, + { + "distill_loss": 0.21588771045207977, + "epoch": 8.599066044029353, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "ref_ce_loss": 0.07171756029129028, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "loss": 0.40631309151649475, + "step": 25780 + }, + { + "ce_loss": 0.09465570747852325, + "epoch": 8.599066044029353, + "step": 25780 + }, + { + "distill_loss": 0.22124746441841125, + "epoch": 8.599066044029353, + "step": 25780 + }, + { + "epoch": 8.599066044029353, + "ref_ce_loss": 0.09005552530288696, + "step": 25780 + }, + { + "epoch": 8.602401601067378, + "loss": 0.4049, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "grad_norm": 1.1606225967407227, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "learning_rate": 4.0284757325097066e-05, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "loss": 0.218998983502388, + "step": 25790 + }, + { + "ce_loss": 0.02312006615102291, + "epoch": 8.602401601067378, + "step": 25790 + }, + { + "distill_loss": 0.12065310776233673, + "epoch": 8.602401601067378, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "ref_ce_loss": 0.04846136271953583, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "loss": 0.279868483543396, + "step": 25790 + }, + { + "ce_loss": 0.04042745381593704, + "epoch": 8.602401601067378, + "step": 25790 + }, + { + "distill_loss": 0.15828919410705566, + "epoch": 8.602401601067378, + "step": 25790 + }, + { + "epoch": 8.602401601067378, + "ref_ce_loss": 0.05135457217693329, + "step": 25790 + }, + { + "epoch": 8.605737158105404, + "loss": 0.4405, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "grad_norm": 1.4962409734725952, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "learning_rate": 4.0095971860909784e-05, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "loss": 0.5475419163703918, + "step": 25800 + }, + { + "ce_loss": 0.06993673741817474, + "epoch": 8.605737158105404, + "step": 25800 + }, + { + "distill_loss": 0.1747581511735916, + "epoch": 8.605737158105404, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "ref_ce_loss": 0.07766447216272354, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "loss": 0.4323030114173889, + "step": 25800 + }, + { + "ce_loss": 0.0695943534374237, + "epoch": 8.605737158105404, + "step": 25800 + }, + { + "distill_loss": 0.17964260280132294, + "epoch": 8.605737158105404, + "step": 25800 + }, + { + "epoch": 8.605737158105404, + "ref_ce_loss": 0.08051568269729614, + "step": 25800 + }, + { + "epoch": 8.609072715143428, + "loss": 0.3955, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "grad_norm": 1.6940163373947144, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "learning_rate": 3.9907606443450615e-05, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "loss": 0.45385682582855225, + "step": 25810 + }, + { + "ce_loss": 0.05719995126128197, + "epoch": 8.609072715143428, + "step": 25810 + }, + { + "distill_loss": 0.19794762134552002, + "epoch": 8.609072715143428, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "ref_ce_loss": 0.08196888864040375, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "loss": 0.33113548159599304, + "step": 25810 + }, + { + "ce_loss": 0.07105362415313721, + "epoch": 8.609072715143428, + "step": 25810 + }, + { + "distill_loss": 0.1685401201248169, + "epoch": 8.609072715143428, + "step": 25810 + }, + { + "epoch": 8.609072715143428, + "ref_ce_loss": 0.06278117746114731, + "step": 25810 + }, + { + "epoch": 8.612408272181455, + "loss": 0.3778, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "grad_norm": 1.8563313484191895, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "learning_rate": 3.9719661292562285e-05, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "loss": 0.2569780945777893, + "step": 25820 + }, + { + "ce_loss": 0.0177643820643425, + "epoch": 8.612408272181455, + "step": 25820 + }, + { + "distill_loss": 0.14377647638320923, + "epoch": 8.612408272181455, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "ref_ce_loss": 0.061776284128427505, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "loss": 0.5325338244438171, + "step": 25820 + }, + { + "ce_loss": 0.0745001882314682, + "epoch": 8.612408272181455, + "step": 25820 + }, + { + "distill_loss": 0.21417376399040222, + "epoch": 8.612408272181455, + "step": 25820 + }, + { + "epoch": 8.612408272181455, + "ref_ce_loss": 0.08413225412368774, + "step": 25820 + }, + { + "epoch": 8.615743829219479, + "loss": 0.3739, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "grad_norm": 1.0426115989685059, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "learning_rate": 3.9532136627597094e-05, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "loss": 0.6225663423538208, + "step": 25830 + }, + { + "ce_loss": 0.04983127489686012, + "epoch": 8.615743829219479, + "step": 25830 + }, + { + "distill_loss": 0.183266282081604, + "epoch": 8.615743829219479, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "ref_ce_loss": 0.07555000483989716, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "loss": 0.5039831399917603, + "step": 25830 + }, + { + "ce_loss": 0.09830626845359802, + "epoch": 8.615743829219479, + "step": 25830 + }, + { + "distill_loss": 0.19829033315181732, + "epoch": 8.615743829219479, + "step": 25830 + }, + { + "epoch": 8.615743829219479, + "ref_ce_loss": 0.08459506183862686, + "step": 25830 + }, + { + "epoch": 8.619079386257505, + "loss": 0.3921, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "grad_norm": 1.0925133228302002, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "learning_rate": 3.9345032667416295e-05, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "loss": 0.45747706294059753, + "step": 25840 + }, + { + "ce_loss": 0.0557032972574234, + "epoch": 8.619079386257505, + "step": 25840 + }, + { + "distill_loss": 0.2225656658411026, + "epoch": 8.619079386257505, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "ref_ce_loss": 0.05074243247509003, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "loss": 0.3209838569164276, + "step": 25840 + }, + { + "ce_loss": 0.04327197000384331, + "epoch": 8.619079386257505, + "step": 25840 + }, + { + "distill_loss": 0.1957104504108429, + "epoch": 8.619079386257505, + "step": 25840 + }, + { + "epoch": 8.619079386257505, + "ref_ce_loss": 0.06365178525447845, + "step": 25840 + }, + { + "epoch": 8.62241494329553, + "loss": 0.396, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "grad_norm": 0.9981101155281067, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "learning_rate": 3.91583496303904e-05, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "loss": 0.3310213088989258, + "step": 25850 + }, + { + "ce_loss": 0.05629245564341545, + "epoch": 8.62241494329553, + "step": 25850 + }, + { + "distill_loss": 0.20885275304317474, + "epoch": 8.62241494329553, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "ref_ce_loss": 0.06559545546770096, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "loss": 0.3649715483188629, + "step": 25850 + }, + { + "ce_loss": 0.09426181763410568, + "epoch": 8.62241494329553, + "step": 25850 + }, + { + "distill_loss": 0.19951693713665009, + "epoch": 8.62241494329553, + "step": 25850 + }, + { + "epoch": 8.62241494329553, + "ref_ce_loss": 0.07036200910806656, + "step": 25850 + }, + { + "epoch": 8.625750500333556, + "loss": 0.4081, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "grad_norm": 1.180769681930542, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "learning_rate": 3.897208773439878e-05, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "loss": 0.3026542365550995, + "step": 25860 + }, + { + "ce_loss": 0.05167260766029358, + "epoch": 8.625750500333556, + "step": 25860 + }, + { + "distill_loss": 0.16578106582164764, + "epoch": 8.625750500333556, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "ref_ce_loss": 0.06161735951900482, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "loss": 0.47855818271636963, + "step": 25860 + }, + { + "ce_loss": 0.07787732034921646, + "epoch": 8.625750500333556, + "step": 25860 + }, + { + "distill_loss": 0.21267056465148926, + "epoch": 8.625750500333556, + "step": 25860 + }, + { + "epoch": 8.625750500333556, + "ref_ce_loss": 0.08411581069231033, + "step": 25860 + }, + { + "epoch": 8.62908605737158, + "loss": 0.4078, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "grad_norm": 1.249125599861145, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "learning_rate": 3.878624719682891e-05, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "loss": 0.5326840281486511, + "step": 25870 + }, + { + "ce_loss": 0.09571799635887146, + "epoch": 8.62908605737158, + "step": 25870 + }, + { + "distill_loss": 0.24862737953662872, + "epoch": 8.62908605737158, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "ref_ce_loss": 0.09078372269868851, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "loss": 0.2817167639732361, + "step": 25870 + }, + { + "ce_loss": 0.029775870963931084, + "epoch": 8.62908605737158, + "step": 25870 + }, + { + "distill_loss": 0.15809574723243713, + "epoch": 8.62908605737158, + "step": 25870 + }, + { + "epoch": 8.62908605737158, + "ref_ce_loss": 0.07221907377243042, + "step": 25870 + }, + { + "epoch": 8.632421614409607, + "loss": 0.4131, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "grad_norm": 1.827296257019043, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "learning_rate": 3.8600828234576804e-05, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "loss": 0.41917675733566284, + "step": 25880 + }, + { + "ce_loss": 0.1008242815732956, + "epoch": 8.632421614409607, + "step": 25880 + }, + { + "distill_loss": 0.17240183055400848, + "epoch": 8.632421614409607, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "ref_ce_loss": 0.08299916237592697, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "loss": 0.35314908623695374, + "step": 25880 + }, + { + "ce_loss": 0.05821432173252106, + "epoch": 8.632421614409607, + "step": 25880 + }, + { + "distill_loss": 0.17668472230434418, + "epoch": 8.632421614409607, + "step": 25880 + }, + { + "epoch": 8.632421614409607, + "ref_ce_loss": 0.0680556446313858, + "step": 25880 + }, + { + "epoch": 8.635757171447631, + "loss": 0.4212, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "grad_norm": 1.620147943496704, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "learning_rate": 3.841583106404629e-05, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "loss": 0.31113311648368835, + "step": 25890 + }, + { + "ce_loss": 0.05124642327427864, + "epoch": 8.635757171447631, + "step": 25890 + }, + { + "distill_loss": 0.18941840529441833, + "epoch": 8.635757171447631, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "ref_ce_loss": 0.07025238871574402, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "loss": 0.36012572050094604, + "step": 25890 + }, + { + "ce_loss": 0.053658533841371536, + "epoch": 8.635757171447631, + "step": 25890 + }, + { + "distill_loss": 0.178445965051651, + "epoch": 8.635757171447631, + "step": 25890 + }, + { + "epoch": 8.635757171447631, + "ref_ce_loss": 0.0968189686536789, + "step": 25890 + }, + { + "epoch": 8.639092728485657, + "loss": 0.3521, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "grad_norm": 15.946155548095703, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "learning_rate": 3.823125590114907e-05, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "loss": 0.30347707867622375, + "step": 25900 + }, + { + "ce_loss": 0.043511081486940384, + "epoch": 8.639092728485657, + "step": 25900 + }, + { + "distill_loss": 0.15417037904262543, + "epoch": 8.639092728485657, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "ref_ce_loss": 0.0651017352938652, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "loss": 0.32813578844070435, + "step": 25900 + }, + { + "ce_loss": 0.061788931488990784, + "epoch": 8.639092728485657, + "step": 25900 + }, + { + "distill_loss": 0.1606302708387375, + "epoch": 8.639092728485657, + "step": 25900 + }, + { + "epoch": 8.639092728485657, + "ref_ce_loss": 0.06535312533378601, + "step": 25900 + }, + { + "epoch": 8.642428285523682, + "loss": 0.3467, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "grad_norm": 1.2270931005477905, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "learning_rate": 3.804710296130405e-05, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "loss": 0.3107259273529053, + "step": 25910 + }, + { + "ce_loss": 0.05802823603153229, + "epoch": 8.642428285523682, + "step": 25910 + }, + { + "distill_loss": 0.15126903355121613, + "epoch": 8.642428285523682, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "ref_ce_loss": 0.06690887361764908, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "loss": 0.5598396062850952, + "step": 25910 + }, + { + "ce_loss": 0.06821523606777191, + "epoch": 8.642428285523682, + "step": 25910 + }, + { + "distill_loss": 0.22665062546730042, + "epoch": 8.642428285523682, + "step": 25910 + }, + { + "epoch": 8.642428285523682, + "ref_ce_loss": 0.10408192873001099, + "step": 25910 + }, + { + "epoch": 8.645763842561708, + "loss": 0.4112, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "grad_norm": 1.304826259613037, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "learning_rate": 3.786337245943763e-05, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "loss": 0.3197181820869446, + "step": 25920 + }, + { + "ce_loss": 0.042518723756074905, + "epoch": 8.645763842561708, + "step": 25920 + }, + { + "distill_loss": 0.20466618239879608, + "epoch": 8.645763842561708, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "ref_ce_loss": 0.05158605799078941, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "loss": 0.4145950376987457, + "step": 25920 + }, + { + "ce_loss": 0.09057069569826126, + "epoch": 8.645763842561708, + "step": 25920 + }, + { + "distill_loss": 0.20773470401763916, + "epoch": 8.645763842561708, + "step": 25920 + }, + { + "epoch": 8.645763842561708, + "ref_ce_loss": 0.09864034503698349, + "step": 25920 + }, + { + "epoch": 8.649099399599733, + "loss": 0.3941, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "grad_norm": 1.0435384511947632, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "learning_rate": 3.768006460998303e-05, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "loss": 0.46100571751594543, + "step": 25930 + }, + { + "ce_loss": 0.1150127723813057, + "epoch": 8.649099399599733, + "step": 25930 + }, + { + "distill_loss": 0.19515320658683777, + "epoch": 8.649099399599733, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "ref_ce_loss": 0.0839739441871643, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "loss": 0.26864948868751526, + "step": 25930 + }, + { + "ce_loss": 0.049335312098264694, + "epoch": 8.649099399599733, + "step": 25930 + }, + { + "distill_loss": 0.15795031189918518, + "epoch": 8.649099399599733, + "step": 25930 + }, + { + "epoch": 8.649099399599733, + "ref_ce_loss": 0.06104101613163948, + "step": 25930 + }, + { + "epoch": 8.652434956637759, + "loss": 0.3951, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "grad_norm": 1.5876872539520264, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "learning_rate": 3.749717962688033e-05, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "loss": 0.34112781286239624, + "step": 25940 + }, + { + "ce_loss": 0.04529694467782974, + "epoch": 8.652434956637759, + "step": 25940 + }, + { + "distill_loss": 0.15988574922084808, + "epoch": 8.652434956637759, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "ref_ce_loss": 0.06164150685071945, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "loss": 0.3117307424545288, + "step": 25940 + }, + { + "ce_loss": 0.036282043904066086, + "epoch": 8.652434956637759, + "step": 25940 + }, + { + "distill_loss": 0.18413367867469788, + "epoch": 8.652434956637759, + "step": 25940 + }, + { + "epoch": 8.652434956637759, + "ref_ce_loss": 0.06488938629627228, + "step": 25940 + }, + { + "epoch": 8.655770513675783, + "loss": 0.3999, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "grad_norm": 1.2973941564559937, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "learning_rate": 3.7314717723575934e-05, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "loss": 0.41473087668418884, + "step": 25950 + }, + { + "ce_loss": 0.07483311742544174, + "epoch": 8.655770513675783, + "step": 25950 + }, + { + "distill_loss": 0.17405973374843597, + "epoch": 8.655770513675783, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "ref_ce_loss": 0.08251118659973145, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "loss": 0.46500372886657715, + "step": 25950 + }, + { + "ce_loss": 0.04539445415139198, + "epoch": 8.655770513675783, + "step": 25950 + }, + { + "distill_loss": 0.21199798583984375, + "epoch": 8.655770513675783, + "step": 25950 + }, + { + "epoch": 8.655770513675783, + "ref_ce_loss": 0.06681691110134125, + "step": 25950 + }, + { + "epoch": 8.65910607071381, + "loss": 0.436, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "grad_norm": 1.4216866493225098, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "learning_rate": 3.7132679113022385e-05, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "loss": 0.35822737216949463, + "step": 25960 + }, + { + "ce_loss": 0.04087655246257782, + "epoch": 8.65910607071381, + "step": 25960 + }, + { + "distill_loss": 0.19274145364761353, + "epoch": 8.65910607071381, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "ref_ce_loss": 0.0882585272192955, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "loss": 0.34755536913871765, + "step": 25960 + }, + { + "ce_loss": 0.05266533046960831, + "epoch": 8.65910607071381, + "step": 25960 + }, + { + "distill_loss": 0.17872610688209534, + "epoch": 8.65910607071381, + "step": 25960 + }, + { + "epoch": 8.65910607071381, + "ref_ce_loss": 0.07696732878684998, + "step": 25960 + }, + { + "epoch": 8.662441627751834, + "loss": 0.4989, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "grad_norm": 3.4985039234161377, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "learning_rate": 3.695106400767854e-05, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "loss": 0.3335474133491516, + "step": 25970 + }, + { + "ce_loss": 0.03969917073845863, + "epoch": 8.662441627751834, + "step": 25970 + }, + { + "distill_loss": 0.19208571314811707, + "epoch": 8.662441627751834, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "ref_ce_loss": 0.06494186818599701, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "loss": 0.3575483560562134, + "step": 25970 + }, + { + "ce_loss": 0.059382472187280655, + "epoch": 8.662441627751834, + "step": 25970 + }, + { + "distill_loss": 0.16217409074306488, + "epoch": 8.662441627751834, + "step": 25970 + }, + { + "epoch": 8.662441627751834, + "ref_ce_loss": 0.056934986263513565, + "step": 25970 + }, + { + "epoch": 8.66577718478986, + "loss": 0.3586, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "grad_norm": 1.2654645442962646, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "learning_rate": 3.676987261950875e-05, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "loss": 0.3632966876029968, + "step": 25980 + }, + { + "ce_loss": 0.04773798957467079, + "epoch": 8.66577718478986, + "step": 25980 + }, + { + "distill_loss": 0.16464367508888245, + "epoch": 8.66577718478986, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "ref_ce_loss": 0.050506629049777985, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "loss": 0.2918108403682709, + "step": 25980 + }, + { + "ce_loss": 0.043732013553380966, + "epoch": 8.66577718478986, + "step": 25980 + }, + { + "distill_loss": 0.142713725566864, + "epoch": 8.66577718478986, + "step": 25980 + }, + { + "epoch": 8.66577718478986, + "ref_ce_loss": 0.07189074903726578, + "step": 25980 + }, + { + "epoch": 8.669112741827885, + "loss": 0.4013, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "grad_norm": 1.0764323472976685, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "learning_rate": 3.65891051599827e-05, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "loss": 0.4605330228805542, + "step": 25990 + }, + { + "ce_loss": 0.08776116371154785, + "epoch": 8.669112741827885, + "step": 25990 + }, + { + "distill_loss": 0.21826300024986267, + "epoch": 8.669112741827885, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "ref_ce_loss": 0.09678833931684494, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "loss": 0.4143109619617462, + "step": 25990 + }, + { + "ce_loss": 0.08480346202850342, + "epoch": 8.669112741827885, + "step": 25990 + }, + { + "distill_loss": 0.19863438606262207, + "epoch": 8.669112741827885, + "step": 25990 + }, + { + "epoch": 8.669112741827885, + "ref_ce_loss": 0.10676558315753937, + "step": 25990 + }, + { + "epoch": 8.67244829886591, + "loss": 0.381, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "grad_norm": 1.4833221435546875, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "learning_rate": 3.64087618400756e-05, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "loss": 0.2441224455833435, + "step": 26000 + }, + { + "ce_loss": 0.02226584032177925, + "epoch": 8.67244829886591, + "step": 26000 + }, + { + "distill_loss": 0.1351041942834854, + "epoch": 8.67244829886591, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "ref_ce_loss": 0.0599244050681591, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "loss": 0.31449344754219055, + "step": 26000 + }, + { + "ce_loss": 0.05308458209037781, + "epoch": 8.67244829886591, + "step": 26000 + }, + { + "distill_loss": 0.18237219750881195, + "epoch": 8.67244829886591, + "step": 26000 + }, + { + "epoch": 8.67244829886591, + "ref_ce_loss": 0.06047021597623825, + "step": 26000 + }, + { + "epoch": 8.675783855903935, + "loss": 0.35, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "grad_norm": 1.7155975103378296, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "learning_rate": 3.622884287026742e-05, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "loss": 0.42710235714912415, + "step": 26010 + }, + { + "ce_loss": 0.06678865104913712, + "epoch": 8.675783855903935, + "step": 26010 + }, + { + "distill_loss": 0.18016648292541504, + "epoch": 8.675783855903935, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "ref_ce_loss": 0.0980868712067604, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "loss": 0.5652369260787964, + "step": 26010 + }, + { + "ce_loss": 0.031212544068694115, + "epoch": 8.675783855903935, + "step": 26010 + }, + { + "distill_loss": 0.16358351707458496, + "epoch": 8.675783855903935, + "step": 26010 + }, + { + "epoch": 8.675783855903935, + "ref_ce_loss": 0.0654633492231369, + "step": 26010 + }, + { + "epoch": 8.679119412941962, + "loss": 0.4155, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "grad_norm": 1.209120512008667, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "learning_rate": 3.604934846054309e-05, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "loss": 0.3771757483482361, + "step": 26020 + }, + { + "ce_loss": 0.028000246733427048, + "epoch": 8.679119412941962, + "step": 26020 + }, + { + "distill_loss": 0.14734534919261932, + "epoch": 8.679119412941962, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "ref_ce_loss": 0.061381395906209946, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "loss": 0.21141190826892853, + "step": 26020 + }, + { + "ce_loss": 0.02972903661429882, + "epoch": 8.679119412941962, + "step": 26020 + }, + { + "distill_loss": 0.13676223158836365, + "epoch": 8.679119412941962, + "step": 26020 + }, + { + "epoch": 8.679119412941962, + "ref_ce_loss": 0.0446452833712101, + "step": 26020 + }, + { + "epoch": 8.682454969979986, + "loss": 0.3541, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "grad_norm": 1.1638330221176147, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "learning_rate": 3.5870278820391777e-05, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "loss": 0.5268926024436951, + "step": 26030 + }, + { + "ce_loss": 0.06840582937002182, + "epoch": 8.682454969979986, + "step": 26030 + }, + { + "distill_loss": 0.2558664083480835, + "epoch": 8.682454969979986, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "ref_ce_loss": 0.07109972089529037, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "loss": 0.475436270236969, + "step": 26030 + }, + { + "ce_loss": 0.056556470692157745, + "epoch": 8.682454969979986, + "step": 26030 + }, + { + "distill_loss": 0.22109992802143097, + "epoch": 8.682454969979986, + "step": 26030 + }, + { + "epoch": 8.682454969979986, + "ref_ce_loss": 0.059039387851953506, + "step": 26030 + }, + { + "epoch": 8.685790527018012, + "loss": 0.415, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "grad_norm": 2.160926342010498, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "learning_rate": 3.569163415880703e-05, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "loss": 0.4365619421005249, + "step": 26040 + }, + { + "ce_loss": 0.0994822308421135, + "epoch": 8.685790527018012, + "step": 26040 + }, + { + "distill_loss": 0.21024930477142334, + "epoch": 8.685790527018012, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "ref_ce_loss": 0.06467576324939728, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "loss": 0.24994733929634094, + "step": 26040 + }, + { + "ce_loss": 0.029614150524139404, + "epoch": 8.685790527018012, + "step": 26040 + }, + { + "distill_loss": 0.14794522523880005, + "epoch": 8.685790527018012, + "step": 26040 + }, + { + "epoch": 8.685790527018012, + "ref_ce_loss": 0.07209348678588867, + "step": 26040 + }, + { + "epoch": 8.689126084056037, + "loss": 0.374, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "grad_norm": 1.0623009204864502, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "learning_rate": 3.551341468428642e-05, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "loss": 0.3026094436645508, + "step": 26050 + }, + { + "ce_loss": 0.04317136108875275, + "epoch": 8.689126084056037, + "step": 26050 + }, + { + "distill_loss": 0.13667577505111694, + "epoch": 8.689126084056037, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "ref_ce_loss": 0.046481385827064514, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "loss": 0.4735656678676605, + "step": 26050 + }, + { + "ce_loss": 0.08374132961034775, + "epoch": 8.689126084056037, + "step": 26050 + }, + { + "distill_loss": 0.2770083546638489, + "epoch": 8.689126084056037, + "step": 26050 + }, + { + "epoch": 8.689126084056037, + "ref_ce_loss": 0.08520796149969101, + "step": 26050 + }, + { + "epoch": 8.692461641094063, + "loss": 0.4028, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "grad_norm": 1.1022357940673828, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "learning_rate": 3.533562060483133e-05, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "loss": 0.2600478529930115, + "step": 26060 + }, + { + "ce_loss": 0.04642507806420326, + "epoch": 8.692461641094063, + "step": 26060 + }, + { + "distill_loss": 0.15849417448043823, + "epoch": 8.692461641094063, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "ref_ce_loss": 0.0549236461520195, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "loss": 0.5013510584831238, + "step": 26060 + }, + { + "ce_loss": 0.0545508898794651, + "epoch": 8.692461641094063, + "step": 26060 + }, + { + "distill_loss": 0.19478538632392883, + "epoch": 8.692461641094063, + "step": 26060 + }, + { + "epoch": 8.692461641094063, + "ref_ce_loss": 0.06514310836791992, + "step": 26060 + }, + { + "epoch": 8.695797198132087, + "loss": 0.404, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "grad_norm": 1.3652430772781372, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "learning_rate": 3.515825212794637e-05, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "loss": 0.3231005370616913, + "step": 26070 + }, + { + "ce_loss": 0.062078312039375305, + "epoch": 8.695797198132087, + "step": 26070 + }, + { + "distill_loss": 0.16984239220619202, + "epoch": 8.695797198132087, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "ref_ce_loss": 0.0763070359826088, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "loss": 0.3238498568534851, + "step": 26070 + }, + { + "ce_loss": 0.07663042098283768, + "epoch": 8.695797198132087, + "step": 26070 + }, + { + "distill_loss": 0.17740431427955627, + "epoch": 8.695797198132087, + "step": 26070 + }, + { + "epoch": 8.695797198132087, + "ref_ce_loss": 0.06952373683452606, + "step": 26070 + }, + { + "epoch": 8.699132755170114, + "loss": 0.3645, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "grad_norm": 1.1212877035140991, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "learning_rate": 3.498130946063984e-05, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "loss": 0.48007920384407043, + "step": 26080 + }, + { + "ce_loss": 0.05316156893968582, + "epoch": 8.699132755170114, + "step": 26080 + }, + { + "distill_loss": 0.20693735778331757, + "epoch": 8.699132755170114, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "ref_ce_loss": 0.061040036380290985, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "loss": 0.43342125415802, + "step": 26080 + }, + { + "ce_loss": 0.06705353409051895, + "epoch": 8.699132755170114, + "step": 26080 + }, + { + "distill_loss": 0.21660195291042328, + "epoch": 8.699132755170114, + "step": 26080 + }, + { + "epoch": 8.699132755170114, + "ref_ce_loss": 0.078488789498806, + "step": 26080 + }, + { + "epoch": 8.702468312208138, + "loss": 0.3753, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "grad_norm": 0.9908813238143921, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "learning_rate": 3.4804792809422795e-05, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "loss": 0.2521405518054962, + "step": 26090 + }, + { + "ce_loss": 0.024630185216665268, + "epoch": 8.702468312208138, + "step": 26090 + }, + { + "distill_loss": 0.15732663869857788, + "epoch": 8.702468312208138, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "ref_ce_loss": 0.06997606158256531, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "loss": 0.4396592378616333, + "step": 26090 + }, + { + "ce_loss": 0.06013498455286026, + "epoch": 8.702468312208138, + "step": 26090 + }, + { + "distill_loss": 0.20719094574451447, + "epoch": 8.702468312208138, + "step": 26090 + }, + { + "epoch": 8.702468312208138, + "ref_ce_loss": 0.07338641583919525, + "step": 26090 + }, + { + "epoch": 8.705803869246164, + "loss": 0.3966, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "grad_norm": 1.140104055404663, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "learning_rate": 3.4628702380309263e-05, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "loss": 0.487138032913208, + "step": 26100 + }, + { + "ce_loss": 0.06893015652894974, + "epoch": 8.705803869246164, + "step": 26100 + }, + { + "distill_loss": 0.21911561489105225, + "epoch": 8.705803869246164, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "ref_ce_loss": 0.06975898146629333, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "loss": 0.3690391778945923, + "step": 26100 + }, + { + "ce_loss": 0.04879662021994591, + "epoch": 8.705803869246164, + "step": 26100 + }, + { + "distill_loss": 0.21773314476013184, + "epoch": 8.705803869246164, + "step": 26100 + }, + { + "epoch": 8.705803869246164, + "ref_ce_loss": 0.08336970210075378, + "step": 26100 + }, + { + "epoch": 8.709139426284189, + "loss": 0.4231, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "grad_norm": 1.1086513996124268, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "learning_rate": 3.445303837881557e-05, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "loss": 0.3410007953643799, + "step": 26110 + }, + { + "ce_loss": 0.03825950622558594, + "epoch": 8.709139426284189, + "step": 26110 + }, + { + "distill_loss": 0.1740851104259491, + "epoch": 8.709139426284189, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "ref_ce_loss": 0.04727938026189804, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "loss": 0.4755092263221741, + "step": 26110 + }, + { + "ce_loss": 0.09632537513971329, + "epoch": 8.709139426284189, + "step": 26110 + }, + { + "distill_loss": 0.20118463039398193, + "epoch": 8.709139426284189, + "step": 26110 + }, + { + "epoch": 8.709139426284189, + "ref_ce_loss": 0.08340964466333389, + "step": 26110 + }, + { + "epoch": 8.712474983322215, + "loss": 0.4273, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "grad_norm": 1.3226609230041504, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "learning_rate": 3.427780100996052e-05, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "loss": 0.47836196422576904, + "step": 26120 + }, + { + "ce_loss": 0.06917423009872437, + "epoch": 8.712474983322215, + "step": 26120 + }, + { + "distill_loss": 0.19600701332092285, + "epoch": 8.712474983322215, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "ref_ce_loss": 0.05268669128417969, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "loss": 0.41963690519332886, + "step": 26120 + }, + { + "ce_loss": 0.06492365151643753, + "epoch": 8.712474983322215, + "step": 26120 + }, + { + "distill_loss": 0.14877067506313324, + "epoch": 8.712474983322215, + "step": 26120 + }, + { + "epoch": 8.712474983322215, + "ref_ce_loss": 0.06951910257339478, + "step": 26120 + }, + { + "epoch": 8.71581054036024, + "loss": 0.4161, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "grad_norm": 2.280256509780884, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "learning_rate": 3.4102990478265086e-05, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "loss": 0.32431760430336, + "step": 26130 + }, + { + "ce_loss": 0.07514671236276627, + "epoch": 8.71581054036024, + "step": 26130 + }, + { + "distill_loss": 0.17155833542346954, + "epoch": 8.71581054036024, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "ref_ce_loss": 0.055567435920238495, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "loss": 0.43972310423851013, + "step": 26130 + }, + { + "ce_loss": 0.06478568911552429, + "epoch": 8.71581054036024, + "step": 26130 + }, + { + "distill_loss": 0.19943878054618835, + "epoch": 8.71581054036024, + "step": 26130 + }, + { + "epoch": 8.71581054036024, + "ref_ce_loss": 0.07526401430368423, + "step": 26130 + }, + { + "epoch": 8.719146097398266, + "loss": 0.396, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "grad_norm": 1.3656131029129028, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "learning_rate": 3.392860698775193e-05, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "loss": 0.42818355560302734, + "step": 26140 + }, + { + "ce_loss": 0.10302330553531647, + "epoch": 8.719146097398266, + "step": 26140 + }, + { + "distill_loss": 0.22370949387550354, + "epoch": 8.719146097398266, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "ref_ce_loss": 0.0865187719464302, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "loss": 0.3065904676914215, + "step": 26140 + }, + { + "ce_loss": 0.041094448417425156, + "epoch": 8.719146097398266, + "step": 26140 + }, + { + "distill_loss": 0.17131191492080688, + "epoch": 8.719146097398266, + "step": 26140 + }, + { + "epoch": 8.719146097398266, + "ref_ce_loss": 0.07021956145763397, + "step": 26140 + }, + { + "epoch": 8.72248165443629, + "loss": 0.4599, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "grad_norm": 2.174772024154663, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "learning_rate": 3.3754650741945324e-05, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "loss": 0.48152804374694824, + "step": 26150 + }, + { + "ce_loss": 0.044403236359357834, + "epoch": 8.72248165443629, + "step": 26150 + }, + { + "distill_loss": 0.21540836989879608, + "epoch": 8.72248165443629, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "ref_ce_loss": 0.06427323073148727, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "loss": 1.2604830265045166, + "step": 26150 + }, + { + "ce_loss": 0.10439719259738922, + "epoch": 8.72248165443629, + "step": 26150 + }, + { + "distill_loss": 0.23386874794960022, + "epoch": 8.72248165443629, + "step": 26150 + }, + { + "epoch": 8.72248165443629, + "ref_ce_loss": 0.07590841501951218, + "step": 26150 + }, + { + "epoch": 8.725817211474316, + "loss": 0.4233, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "grad_norm": 0.880950391292572, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "learning_rate": 3.358112194387086e-05, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "loss": 0.34046900272369385, + "step": 26160 + }, + { + "ce_loss": 0.041662558913230896, + "epoch": 8.725817211474316, + "step": 26160 + }, + { + "distill_loss": 0.21017879247665405, + "epoch": 8.725817211474316, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "ref_ce_loss": 0.05803197994828224, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "loss": 0.36259445548057556, + "step": 26160 + }, + { + "ce_loss": 0.07198458164930344, + "epoch": 8.725817211474316, + "step": 26160 + }, + { + "distill_loss": 0.1978611946105957, + "epoch": 8.725817211474316, + "step": 26160 + }, + { + "epoch": 8.725817211474316, + "ref_ce_loss": 0.09256937354803085, + "step": 26160 + }, + { + "epoch": 8.729152768512341, + "loss": 0.3867, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "grad_norm": 1.1352921724319458, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "learning_rate": 3.3408020796055425e-05, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "loss": 0.5861698389053345, + "step": 26170 + }, + { + "ce_loss": 0.06542176008224487, + "epoch": 8.729152768512341, + "step": 26170 + }, + { + "distill_loss": 0.19351504743099213, + "epoch": 8.729152768512341, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "ref_ce_loss": 0.07479386776685715, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "loss": 0.38491737842559814, + "step": 26170 + }, + { + "ce_loss": 0.0400855578482151, + "epoch": 8.729152768512341, + "step": 26170 + }, + { + "distill_loss": 0.19322989881038666, + "epoch": 8.729152768512341, + "step": 26170 + }, + { + "epoch": 8.729152768512341, + "ref_ce_loss": 0.0906819999217987, + "step": 26170 + }, + { + "epoch": 8.732488325550367, + "loss": 0.3808, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "grad_norm": 1.597434639930725, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "learning_rate": 3.323534750052666e-05, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "loss": 0.28560367226600647, + "step": 26180 + }, + { + "ce_loss": 0.02849467098712921, + "epoch": 8.732488325550367, + "step": 26180 + }, + { + "distill_loss": 0.19566433131694794, + "epoch": 8.732488325550367, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "ref_ce_loss": 0.061204612255096436, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "loss": 0.273761510848999, + "step": 26180 + }, + { + "ce_loss": 0.045868728309869766, + "epoch": 8.732488325550367, + "step": 26180 + }, + { + "distill_loss": 0.16468042135238647, + "epoch": 8.732488325550367, + "step": 26180 + }, + { + "epoch": 8.732488325550367, + "ref_ce_loss": 0.062920480966568, + "step": 26180 + }, + { + "epoch": 8.735823882588392, + "loss": 0.3552, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "grad_norm": 1.0125452280044556, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "learning_rate": 3.306310225881286e-05, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "loss": 0.4199540615081787, + "step": 26190 + }, + { + "ce_loss": 0.06449312716722488, + "epoch": 8.735823882588392, + "step": 26190 + }, + { + "distill_loss": 0.19299976527690887, + "epoch": 8.735823882588392, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "ref_ce_loss": 0.08478783071041107, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "loss": 0.8032296895980835, + "step": 26190 + }, + { + "ce_loss": 0.07914341241121292, + "epoch": 8.735823882588392, + "step": 26190 + }, + { + "distill_loss": 0.22294864058494568, + "epoch": 8.735823882588392, + "step": 26190 + }, + { + "epoch": 8.735823882588392, + "ref_ce_loss": 0.06438795477151871, + "step": 26190 + }, + { + "epoch": 8.739159439626418, + "loss": 0.4116, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "grad_norm": 1.4149339199066162, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "learning_rate": 3.289128527194279e-05, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "loss": 0.2872389554977417, + "step": 26200 + }, + { + "ce_loss": 0.05248313397169113, + "epoch": 8.739159439626418, + "step": 26200 + }, + { + "distill_loss": 0.12529276311397552, + "epoch": 8.739159439626418, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "ref_ce_loss": 0.07293683290481567, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "loss": 0.6678260564804077, + "step": 26200 + }, + { + "ce_loss": 0.07362057268619537, + "epoch": 8.739159439626418, + "step": 26200 + }, + { + "distill_loss": 0.201996847987175, + "epoch": 8.739159439626418, + "step": 26200 + }, + { + "epoch": 8.739159439626418, + "ref_ce_loss": 0.06612657755613327, + "step": 26200 + }, + { + "epoch": 8.742494996664442, + "loss": 0.3798, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "grad_norm": 1.198406457901001, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "learning_rate": 3.27198967404454e-05, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "loss": 0.400199830532074, + "step": 26210 + }, + { + "ce_loss": 0.06638126820325851, + "epoch": 8.742494996664442, + "step": 26210 + }, + { + "distill_loss": 0.17453718185424805, + "epoch": 8.742494996664442, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "ref_ce_loss": 0.08012498915195465, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "loss": 0.5893149375915527, + "step": 26210 + }, + { + "ce_loss": 0.05600981041789055, + "epoch": 8.742494996664442, + "step": 26210 + }, + { + "distill_loss": 0.15208479762077332, + "epoch": 8.742494996664442, + "step": 26210 + }, + { + "epoch": 8.742494996664442, + "ref_ce_loss": 0.053888577967882156, + "step": 26210 + }, + { + "epoch": 8.745830553702469, + "loss": 0.4131, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "grad_norm": 1.1500005722045898, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "learning_rate": 3.254893686434941e-05, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "loss": 0.4473916292190552, + "step": 26220 + }, + { + "ce_loss": 0.05827895179390907, + "epoch": 8.745830553702469, + "step": 26220 + }, + { + "distill_loss": 0.23006944358348846, + "epoch": 8.745830553702469, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "ref_ce_loss": 0.08817873150110245, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "loss": 0.5407593250274658, + "step": 26220 + }, + { + "ce_loss": 0.10037504881620407, + "epoch": 8.745830553702469, + "step": 26220 + }, + { + "distill_loss": 0.20246253907680511, + "epoch": 8.745830553702469, + "step": 26220 + }, + { + "epoch": 8.745830553702469, + "ref_ce_loss": 0.09331716597080231, + "step": 26220 + }, + { + "epoch": 8.749166110740493, + "loss": 0.4097, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "grad_norm": 0.9860525131225586, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "learning_rate": 3.2378405843183435e-05, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "loss": 0.6364589929580688, + "step": 26230 + }, + { + "ce_loss": 0.06990605592727661, + "epoch": 8.749166110740493, + "step": 26230 + }, + { + "distill_loss": 0.1852722465991974, + "epoch": 8.749166110740493, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "ref_ce_loss": 0.08063516765832901, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "loss": 0.260394424200058, + "step": 26230 + }, + { + "ce_loss": 0.03426089510321617, + "epoch": 8.749166110740493, + "step": 26230 + }, + { + "distill_loss": 0.1199391782283783, + "epoch": 8.749166110740493, + "step": 26230 + }, + { + "epoch": 8.749166110740493, + "ref_ce_loss": 0.05145851522684097, + "step": 26230 + }, + { + "epoch": 8.75250166777852, + "loss": 0.418, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "grad_norm": 1.1872379779815674, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "learning_rate": 3.22083038759756e-05, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "loss": 0.3817587196826935, + "step": 26240 + }, + { + "ce_loss": 0.054080668836832047, + "epoch": 8.75250166777852, + "step": 26240 + }, + { + "distill_loss": 0.20794880390167236, + "epoch": 8.75250166777852, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "ref_ce_loss": 0.08582180738449097, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "loss": 0.29263123869895935, + "step": 26240 + }, + { + "ce_loss": 0.0385243184864521, + "epoch": 8.75250166777852, + "step": 26240 + }, + { + "distill_loss": 0.20065245032310486, + "epoch": 8.75250166777852, + "step": 26240 + }, + { + "epoch": 8.75250166777852, + "ref_ce_loss": 0.0532672181725502, + "step": 26240 + }, + { + "epoch": 8.755837224816544, + "loss": 0.3957, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "grad_norm": 1.3207815885543823, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "learning_rate": 3.2038631161253226e-05, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "loss": 0.5277407169342041, + "step": 26250 + }, + { + "ce_loss": 0.06554209440946579, + "epoch": 8.755837224816544, + "step": 26250 + }, + { + "distill_loss": 0.21977917850017548, + "epoch": 8.755837224816544, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "ref_ce_loss": 0.07910329848527908, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "loss": 0.3791898787021637, + "step": 26250 + }, + { + "ce_loss": 0.04661162197589874, + "epoch": 8.755837224816544, + "step": 26250 + }, + { + "distill_loss": 0.16154451668262482, + "epoch": 8.755837224816544, + "step": 26250 + }, + { + "epoch": 8.755837224816544, + "ref_ce_loss": 0.06869801133871078, + "step": 26250 + }, + { + "epoch": 8.75917278185457, + "loss": 0.3538, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "grad_norm": 0.9695892333984375, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "learning_rate": 3.18693878970425e-05, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "loss": 0.30223411321640015, + "step": 26260 + }, + { + "ce_loss": 0.0531826950609684, + "epoch": 8.75917278185457, + "step": 26260 + }, + { + "distill_loss": 0.16183307766914368, + "epoch": 8.75917278185457, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "ref_ce_loss": 0.05986183509230614, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "loss": 0.2744012475013733, + "step": 26260 + }, + { + "ce_loss": 0.032525599002838135, + "epoch": 8.75917278185457, + "step": 26260 + }, + { + "distill_loss": 0.15671776235103607, + "epoch": 8.75917278185457, + "step": 26260 + }, + { + "epoch": 8.75917278185457, + "ref_ce_loss": 0.05136071890592575, + "step": 26260 + }, + { + "epoch": 8.762508338892594, + "loss": 0.4084, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "grad_norm": 1.2642502784729004, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "learning_rate": 3.170057428086861e-05, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "loss": 0.5678555369377136, + "step": 26270 + }, + { + "ce_loss": 0.10616932809352875, + "epoch": 8.762508338892594, + "step": 26270 + }, + { + "distill_loss": 0.2803439199924469, + "epoch": 8.762508338892594, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "ref_ce_loss": 0.10130172967910767, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "loss": 0.5963680148124695, + "step": 26270 + }, + { + "ce_loss": 0.0781753733754158, + "epoch": 8.762508338892594, + "step": 26270 + }, + { + "distill_loss": 0.18746884167194366, + "epoch": 8.762508338892594, + "step": 26270 + }, + { + "epoch": 8.762508338892594, + "ref_ce_loss": 0.07577872276306152, + "step": 26270 + }, + { + "epoch": 8.76584389593062, + "loss": 0.3727, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "grad_norm": 1.0161960124969482, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "learning_rate": 3.15321905097552e-05, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "loss": 0.33225592970848083, + "step": 26280 + }, + { + "ce_loss": 0.05595911666750908, + "epoch": 8.76584389593062, + "step": 26280 + }, + { + "distill_loss": 0.20386509597301483, + "epoch": 8.76584389593062, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "ref_ce_loss": 0.051552511751651764, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "loss": 0.2532827854156494, + "step": 26280 + }, + { + "ce_loss": 0.030611203983426094, + "epoch": 8.76584389593062, + "step": 26280 + }, + { + "distill_loss": 0.16473987698554993, + "epoch": 8.76584389593062, + "step": 26280 + }, + { + "epoch": 8.76584389593062, + "ref_ce_loss": 0.0576915368437767, + "step": 26280 + }, + { + "epoch": 8.769179452968645, + "loss": 0.3891, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "grad_norm": 1.3631794452667236, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "learning_rate": 3.136423678022422e-05, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "loss": 0.382942795753479, + "step": 26290 + }, + { + "ce_loss": 0.06846325099468231, + "epoch": 8.769179452968645, + "step": 26290 + }, + { + "distill_loss": 0.14988799393177032, + "epoch": 8.769179452968645, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "ref_ce_loss": 0.10606774687767029, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "loss": 0.6185066103935242, + "step": 26290 + }, + { + "ce_loss": 0.041892632842063904, + "epoch": 8.769179452968645, + "step": 26290 + }, + { + "distill_loss": 0.1824905127286911, + "epoch": 8.769179452968645, + "step": 26290 + }, + { + "epoch": 8.769179452968645, + "ref_ce_loss": 0.06602640450000763, + "step": 26290 + }, + { + "epoch": 8.772515010006671, + "loss": 0.3849, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "grad_norm": 1.2957016229629517, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "learning_rate": 3.119671328829576e-05, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "loss": 0.34795233607292175, + "step": 26300 + }, + { + "ce_loss": 0.07486037164926529, + "epoch": 8.772515010006671, + "step": 26300 + }, + { + "distill_loss": 0.17705968022346497, + "epoch": 8.772515010006671, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "ref_ce_loss": 0.09589973092079163, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "loss": 0.36749204993247986, + "step": 26300 + }, + { + "ce_loss": 0.08413571119308472, + "epoch": 8.772515010006671, + "step": 26300 + }, + { + "distill_loss": 0.1869279444217682, + "epoch": 8.772515010006671, + "step": 26300 + }, + { + "epoch": 8.772515010006671, + "ref_ce_loss": 0.07151706516742706, + "step": 26300 + }, + { + "epoch": 8.775850567044696, + "loss": 0.3681, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "grad_norm": 1.7406237125396729, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "learning_rate": 3.102962022948779e-05, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "loss": 0.4500211179256439, + "step": 26310 + }, + { + "ce_loss": 0.04246877133846283, + "epoch": 8.775850567044696, + "step": 26310 + }, + { + "distill_loss": 0.15817828476428986, + "epoch": 8.775850567044696, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "ref_ce_loss": 0.06595441699028015, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "loss": 0.20872564613819122, + "step": 26310 + }, + { + "ce_loss": 0.028664544224739075, + "epoch": 8.775850567044696, + "step": 26310 + }, + { + "distill_loss": 0.11665079742670059, + "epoch": 8.775850567044696, + "step": 26310 + }, + { + "epoch": 8.775850567044696, + "ref_ce_loss": 0.036202266812324524, + "step": 26310 + }, + { + "epoch": 8.779186124082722, + "loss": 0.4003, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "grad_norm": 1.1860415935516357, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "learning_rate": 3.086295779881585e-05, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "loss": 0.35605388879776, + "step": 26320 + }, + { + "ce_loss": 0.06914496421813965, + "epoch": 8.779186124082722, + "step": 26320 + }, + { + "distill_loss": 0.2034725397825241, + "epoch": 8.779186124082722, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "ref_ce_loss": 0.08318885415792465, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "loss": 0.29098474979400635, + "step": 26320 + }, + { + "ce_loss": 0.07168518006801605, + "epoch": 8.779186124082722, + "step": 26320 + }, + { + "distill_loss": 0.15839485824108124, + "epoch": 8.779186124082722, + "step": 26320 + }, + { + "epoch": 8.779186124082722, + "ref_ce_loss": 0.06056111305952072, + "step": 26320 + }, + { + "epoch": 8.782521681120746, + "loss": 0.3944, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "grad_norm": 1.4006574153900146, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "learning_rate": 3.0696726190793024e-05, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "loss": 0.2608528435230255, + "step": 26330 + }, + { + "ce_loss": 0.04916700720787048, + "epoch": 8.782521681120746, + "step": 26330 + }, + { + "distill_loss": 0.14268429577350616, + "epoch": 8.782521681120746, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "ref_ce_loss": 0.06881540268659592, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "loss": 0.41059139370918274, + "step": 26330 + }, + { + "ce_loss": 0.08422189950942993, + "epoch": 8.782521681120746, + "step": 26330 + }, + { + "distill_loss": 0.20686335861682892, + "epoch": 8.782521681120746, + "step": 26330 + }, + { + "epoch": 8.782521681120746, + "ref_ce_loss": 0.08716470003128052, + "step": 26330 + }, + { + "epoch": 8.785857238158773, + "loss": 0.4231, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "grad_norm": 1.3243461847305298, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "learning_rate": 3.053092559942932e-05, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "loss": 0.3654715418815613, + "step": 26340 + }, + { + "ce_loss": 0.05218815058469772, + "epoch": 8.785857238158773, + "step": 26340 + }, + { + "distill_loss": 0.16668446362018585, + "epoch": 8.785857238158773, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "ref_ce_loss": 0.07640382647514343, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "loss": 0.41766300797462463, + "step": 26340 + }, + { + "ce_loss": 0.07735947519540787, + "epoch": 8.785857238158773, + "step": 26340 + }, + { + "distill_loss": 0.17040522396564484, + "epoch": 8.785857238158773, + "step": 26340 + }, + { + "epoch": 8.785857238158773, + "ref_ce_loss": 0.06796623021364212, + "step": 26340 + }, + { + "epoch": 8.789192795196797, + "loss": 0.4219, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "grad_norm": 1.217641830444336, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "learning_rate": 3.0365556218231983e-05, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "loss": 0.42024436593055725, + "step": 26350 + }, + { + "ce_loss": 0.0894358679652214, + "epoch": 8.789192795196797, + "step": 26350 + }, + { + "distill_loss": 0.1911226511001587, + "epoch": 8.789192795196797, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "ref_ce_loss": 0.06873385608196259, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "loss": 0.4086933434009552, + "step": 26350 + }, + { + "ce_loss": 0.05865989997982979, + "epoch": 8.789192795196797, + "step": 26350 + }, + { + "distill_loss": 0.13806791603565216, + "epoch": 8.789192795196797, + "step": 26350 + }, + { + "epoch": 8.789192795196797, + "ref_ce_loss": 0.0668598860502243, + "step": 26350 + }, + { + "epoch": 8.792528352234823, + "loss": 0.3949, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "grad_norm": 1.2597485780715942, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "learning_rate": 3.020061824020486e-05, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "loss": 0.3705188035964966, + "step": 26360 + }, + { + "ce_loss": 0.05611937865614891, + "epoch": 8.792528352234823, + "step": 26360 + }, + { + "distill_loss": 0.18796519935131073, + "epoch": 8.792528352234823, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "ref_ce_loss": 0.08399634808301926, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "loss": 0.29237592220306396, + "step": 26360 + }, + { + "ce_loss": 0.04235415533185005, + "epoch": 8.792528352234823, + "step": 26360 + }, + { + "distill_loss": 0.16000601649284363, + "epoch": 8.792528352234823, + "step": 26360 + }, + { + "epoch": 8.792528352234823, + "ref_ce_loss": 0.07138751447200775, + "step": 26360 + }, + { + "epoch": 8.795863909272848, + "loss": 0.4432, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "grad_norm": 2.4199059009552, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "learning_rate": 3.003611185784836e-05, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "loss": 0.389001727104187, + "step": 26370 + }, + { + "ce_loss": 0.053413379937410355, + "epoch": 8.795863909272848, + "step": 26370 + }, + { + "distill_loss": 0.17966410517692566, + "epoch": 8.795863909272848, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "ref_ce_loss": 0.0777275413274765, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "loss": 0.272889643907547, + "step": 26370 + }, + { + "ce_loss": 0.05025883764028549, + "epoch": 8.795863909272848, + "step": 26370 + }, + { + "distill_loss": 0.1371891349554062, + "epoch": 8.795863909272848, + "step": 26370 + }, + { + "epoch": 8.795863909272848, + "ref_ce_loss": 0.05508722737431526, + "step": 26370 + }, + { + "epoch": 8.799199466310874, + "loss": 0.3584, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "grad_norm": 1.75760817527771, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "learning_rate": 2.987203726315899e-05, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "loss": 0.6803490519523621, + "step": 26380 + }, + { + "ce_loss": 0.053485460579395294, + "epoch": 8.799199466310874, + "step": 26380 + }, + { + "distill_loss": 0.1900993436574936, + "epoch": 8.799199466310874, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "ref_ce_loss": 0.10119712352752686, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "loss": 0.37675246596336365, + "step": 26380 + }, + { + "ce_loss": 0.06931252777576447, + "epoch": 8.799199466310874, + "step": 26380 + }, + { + "distill_loss": 0.21957695484161377, + "epoch": 8.799199466310874, + "step": 26380 + }, + { + "epoch": 8.799199466310874, + "ref_ce_loss": 0.08745820820331573, + "step": 26380 + }, + { + "epoch": 8.802535023348899, + "loss": 0.4442, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "grad_norm": 1.3044569492340088, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "learning_rate": 2.970839464762958e-05, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "loss": 0.23337514698505402, + "step": 26390 + }, + { + "ce_loss": 0.03274532034993172, + "epoch": 8.802535023348899, + "step": 26390 + }, + { + "distill_loss": 0.12590210139751434, + "epoch": 8.802535023348899, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "ref_ce_loss": 0.05140114203095436, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "loss": 0.3965371549129486, + "step": 26390 + }, + { + "ce_loss": 0.07348036020994186, + "epoch": 8.802535023348899, + "step": 26390 + }, + { + "distill_loss": 0.2069893330335617, + "epoch": 8.802535023348899, + "step": 26390 + }, + { + "epoch": 8.802535023348899, + "ref_ce_loss": 0.08539339900016785, + "step": 26390 + }, + { + "epoch": 8.805870580386925, + "loss": 0.3868, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "grad_norm": 1.3240303993225098, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "learning_rate": 2.954518420224868e-05, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "loss": 0.31848961114883423, + "step": 26400 + }, + { + "ce_loss": 0.07669753581285477, + "epoch": 8.805870580386925, + "step": 26400 + }, + { + "distill_loss": 0.19089281558990479, + "epoch": 8.805870580386925, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "ref_ce_loss": 0.050712063908576965, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "loss": 0.25955644249916077, + "step": 26400 + }, + { + "ce_loss": 0.028958193957805634, + "epoch": 8.805870580386925, + "step": 26400 + }, + { + "distill_loss": 0.16439704596996307, + "epoch": 8.805870580386925, + "step": 26400 + }, + { + "epoch": 8.805870580386925, + "ref_ce_loss": 0.06597182899713516, + "step": 26400 + }, + { + "epoch": 8.80920613742495, + "loss": 0.4087, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "grad_norm": 1.2474710941314697, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "learning_rate": 2.938240611750036e-05, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "loss": 0.38615283370018005, + "step": 26410 + }, + { + "ce_loss": 0.06278189271688461, + "epoch": 8.80920613742495, + "step": 26410 + }, + { + "distill_loss": 0.1992618292570114, + "epoch": 8.80920613742495, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "ref_ce_loss": 0.05499156564474106, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "loss": 0.2698794901371002, + "step": 26410 + }, + { + "ce_loss": 0.056827716529369354, + "epoch": 8.80920613742495, + "step": 26410 + }, + { + "distill_loss": 0.15084534883499146, + "epoch": 8.80920613742495, + "step": 26410 + }, + { + "epoch": 8.80920613742495, + "ref_ce_loss": 0.061909135431051254, + "step": 26410 + }, + { + "epoch": 8.812541694462976, + "loss": 0.3241, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "grad_norm": 0.8669230341911316, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "learning_rate": 2.9220060583364217e-05, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "loss": 0.32821089029312134, + "step": 26420 + }, + { + "ce_loss": 0.05677766725420952, + "epoch": 8.812541694462976, + "step": 26420 + }, + { + "distill_loss": 0.1873900294303894, + "epoch": 8.812541694462976, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "ref_ce_loss": 0.08385615795850754, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "loss": 0.3158455193042755, + "step": 26420 + }, + { + "ce_loss": 0.05159135162830353, + "epoch": 8.812541694462976, + "step": 26420 + }, + { + "distill_loss": 0.15918651223182678, + "epoch": 8.812541694462976, + "step": 26420 + }, + { + "epoch": 8.812541694462976, + "ref_ce_loss": 0.05180855467915535, + "step": 26420 + }, + { + "epoch": 8.815877251501, + "loss": 0.375, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "grad_norm": 1.4044928550720215, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "learning_rate": 2.9058147789314903e-05, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "loss": 0.3125324249267578, + "step": 26430 + }, + { + "ce_loss": 0.05299573391675949, + "epoch": 8.815877251501, + "step": 26430 + }, + { + "distill_loss": 0.14900648593902588, + "epoch": 8.815877251501, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "ref_ce_loss": 0.07863525301218033, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "loss": 0.3457852005958557, + "step": 26430 + }, + { + "ce_loss": 0.055660802870988846, + "epoch": 8.815877251501, + "step": 26430 + }, + { + "distill_loss": 0.16527312994003296, + "epoch": 8.815877251501, + "step": 26430 + }, + { + "epoch": 8.815877251501, + "ref_ce_loss": 0.07229301333427429, + "step": 26430 + }, + { + "epoch": 8.819212808539026, + "loss": 0.3643, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "grad_norm": 5.930783748626709, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "learning_rate": 2.8896667924322153e-05, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "loss": 0.34311580657958984, + "step": 26440 + }, + { + "ce_loss": 0.06892959773540497, + "epoch": 8.819212808539026, + "step": 26440 + }, + { + "distill_loss": 0.214860200881958, + "epoch": 8.819212808539026, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "ref_ce_loss": 0.05908918380737305, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "loss": 0.5133229494094849, + "step": 26440 + }, + { + "ce_loss": 0.08564013987779617, + "epoch": 8.819212808539026, + "step": 26440 + }, + { + "distill_loss": 0.2291836142539978, + "epoch": 8.819212808539026, + "step": 26440 + }, + { + "epoch": 8.819212808539026, + "ref_ce_loss": 0.08884364366531372, + "step": 26440 + }, + { + "epoch": 8.82254836557705, + "loss": 0.3797, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "grad_norm": 1.0255451202392578, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "learning_rate": 2.8735621176850404e-05, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "loss": 0.3774876892566681, + "step": 26450 + }, + { + "ce_loss": 0.03586650267243385, + "epoch": 8.82254836557705, + "step": 26450 + }, + { + "distill_loss": 0.1628786027431488, + "epoch": 8.82254836557705, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "ref_ce_loss": 0.07327239960432053, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "loss": 0.24797579646110535, + "step": 26450 + }, + { + "ce_loss": 0.018110867589712143, + "epoch": 8.82254836557705, + "step": 26450 + }, + { + "distill_loss": 0.14303602278232574, + "epoch": 8.82254836557705, + "step": 26450 + }, + { + "epoch": 8.82254836557705, + "ref_ce_loss": 0.06473203748464584, + "step": 26450 + }, + { + "epoch": 8.825883922615077, + "loss": 0.4209, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "grad_norm": 1.341780424118042, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "learning_rate": 2.8575007734858327e-05, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "loss": 0.28869760036468506, + "step": 26460 + }, + { + "ce_loss": 0.04272546246647835, + "epoch": 8.825883922615077, + "step": 26460 + }, + { + "distill_loss": 0.14046624302864075, + "epoch": 8.825883922615077, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "ref_ce_loss": 0.07356305420398712, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "loss": 0.2093331515789032, + "step": 26460 + }, + { + "ce_loss": 0.016943862661719322, + "epoch": 8.825883922615077, + "step": 26460 + }, + { + "distill_loss": 0.1336776614189148, + "epoch": 8.825883922615077, + "step": 26460 + }, + { + "epoch": 8.825883922615077, + "ref_ce_loss": 0.042915888130664825, + "step": 26460 + }, + { + "epoch": 8.829219479653101, + "loss": 0.3831, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "grad_norm": 2.0843732357025146, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "learning_rate": 2.8414827785799226e-05, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "loss": 0.3446524143218994, + "step": 26470 + }, + { + "ce_loss": 0.05340450629591942, + "epoch": 8.829219479653101, + "step": 26470 + }, + { + "distill_loss": 0.17530488967895508, + "epoch": 8.829219479653101, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "ref_ce_loss": 0.07423309236764908, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "loss": 0.44540131092071533, + "step": 26470 + }, + { + "ce_loss": 0.10081108659505844, + "epoch": 8.829219479653101, + "step": 26470 + }, + { + "distill_loss": 0.1769978553056717, + "epoch": 8.829219479653101, + "step": 26470 + }, + { + "epoch": 8.829219479653101, + "ref_ce_loss": 0.08398478478193283, + "step": 26470 + }, + { + "epoch": 8.832555036691128, + "loss": 0.36, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "grad_norm": 1.5165905952453613, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "learning_rate": 2.8255081516620307e-05, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "loss": 0.3594178557395935, + "step": 26480 + }, + { + "ce_loss": 0.05317145213484764, + "epoch": 8.832555036691128, + "step": 26480 + }, + { + "distill_loss": 0.1571061611175537, + "epoch": 8.832555036691128, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "ref_ce_loss": 0.07676803320646286, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "loss": 0.6816879510879517, + "step": 26480 + }, + { + "ce_loss": 0.05865504592657089, + "epoch": 8.832555036691128, + "step": 26480 + }, + { + "distill_loss": 0.1645684540271759, + "epoch": 8.832555036691128, + "step": 26480 + }, + { + "epoch": 8.832555036691128, + "ref_ce_loss": 0.07839664816856384, + "step": 26480 + }, + { + "epoch": 8.835890593729152, + "loss": 0.4173, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "grad_norm": 1.228607177734375, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "learning_rate": 2.809576911376275e-05, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "loss": 0.3466040790081024, + "step": 26490 + }, + { + "ce_loss": 0.08471109718084335, + "epoch": 8.835890593729152, + "step": 26490 + }, + { + "distill_loss": 0.18720370531082153, + "epoch": 8.835890593729152, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "ref_ce_loss": 0.074448361992836, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "loss": 1.1317849159240723, + "step": 26490 + }, + { + "ce_loss": 0.06539808958768845, + "epoch": 8.835890593729152, + "step": 26490 + }, + { + "distill_loss": 0.1788891851902008, + "epoch": 8.835890593729152, + "step": 26490 + }, + { + "epoch": 8.835890593729152, + "ref_ce_loss": 0.09472262859344482, + "step": 26490 + }, + { + "epoch": 8.839226150767178, + "loss": 0.3817, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "grad_norm": 2.034001350402832, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "learning_rate": 2.7936890763161106e-05, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "loss": 0.7755275368690491, + "step": 26500 + }, + { + "ce_loss": 0.05330686643719673, + "epoch": 8.839226150767178, + "step": 26500 + }, + { + "distill_loss": 0.15255343914031982, + "epoch": 8.839226150767178, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "ref_ce_loss": 0.08559133112430573, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "loss": 0.3958842158317566, + "step": 26500 + }, + { + "ce_loss": 0.08466436713933945, + "epoch": 8.839226150767178, + "step": 26500 + }, + { + "distill_loss": 0.2283436357975006, + "epoch": 8.839226150767178, + "step": 26500 + }, + { + "epoch": 8.839226150767178, + "ref_ce_loss": 0.08261482417583466, + "step": 26500 + }, + { + "epoch": 8.842561707805203, + "loss": 0.3551, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "grad_norm": 1.1143865585327148, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "learning_rate": 2.7778446650243582e-05, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "loss": 0.45422789454460144, + "step": 26510 + }, + { + "ce_loss": 0.05119134113192558, + "epoch": 8.842561707805203, + "step": 26510 + }, + { + "distill_loss": 0.20761536061763763, + "epoch": 8.842561707805203, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "ref_ce_loss": 0.0848504975438118, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "loss": 0.2579534351825714, + "step": 26510 + }, + { + "ce_loss": 0.044855259358882904, + "epoch": 8.842561707805203, + "step": 26510 + }, + { + "distill_loss": 0.12745003402233124, + "epoch": 8.842561707805203, + "step": 26510 + }, + { + "epoch": 8.842561707805203, + "ref_ce_loss": 0.05585559085011482, + "step": 26510 + }, + { + "epoch": 8.845897264843229, + "loss": 0.3267, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "grad_norm": 0.9358294010162354, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "learning_rate": 2.762043695993155e-05, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "loss": 0.38387444615364075, + "step": 26520 + }, + { + "ce_loss": 0.04712322726845741, + "epoch": 8.845897264843229, + "step": 26520 + }, + { + "distill_loss": 0.1761668622493744, + "epoch": 8.845897264843229, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "ref_ce_loss": 0.07620155811309814, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "loss": 1.0314216613769531, + "step": 26520 + }, + { + "ce_loss": 0.0835966244339943, + "epoch": 8.845897264843229, + "step": 26520 + }, + { + "distill_loss": 0.2500387728214264, + "epoch": 8.845897264843229, + "step": 26520 + }, + { + "epoch": 8.845897264843229, + "ref_ce_loss": 0.11306946724653244, + "step": 26520 + }, + { + "epoch": 8.849232821881253, + "loss": 0.4203, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "grad_norm": 2.512843370437622, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "learning_rate": 2.7462861876639223e-05, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "loss": 0.36169400811195374, + "step": 26530 + }, + { + "ce_loss": 0.08441756665706635, + "epoch": 8.849232821881253, + "step": 26530 + }, + { + "distill_loss": 0.18359051644802094, + "epoch": 8.849232821881253, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "ref_ce_loss": 0.09323541074991226, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "loss": 0.4454735219478607, + "step": 26530 + }, + { + "ce_loss": 0.09442145377397537, + "epoch": 8.849232821881253, + "step": 26530 + }, + { + "distill_loss": 0.16280341148376465, + "epoch": 8.849232821881253, + "step": 26530 + }, + { + "epoch": 8.849232821881253, + "ref_ce_loss": 0.07556495815515518, + "step": 26530 + }, + { + "epoch": 8.85256837891928, + "loss": 0.3797, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "grad_norm": 1.3537561893463135, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "learning_rate": 2.7305721584273802e-05, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "loss": 0.34503787755966187, + "step": 26540 + }, + { + "ce_loss": 0.05430424585938454, + "epoch": 8.85256837891928, + "step": 26540 + }, + { + "distill_loss": 0.18627524375915527, + "epoch": 8.85256837891928, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "ref_ce_loss": 0.06884271651506424, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "loss": 0.22353094816207886, + "step": 26540 + }, + { + "ce_loss": 0.028932973742485046, + "epoch": 8.85256837891928, + "step": 26540 + }, + { + "distill_loss": 0.12646234035491943, + "epoch": 8.85256837891928, + "step": 26540 + }, + { + "epoch": 8.85256837891928, + "ref_ce_loss": 0.06783977150917053, + "step": 26540 + }, + { + "epoch": 8.855903935957304, + "loss": 0.4032, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "grad_norm": 1.1834700107574463, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "learning_rate": 2.714901626623485e-05, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "loss": 0.3856170177459717, + "step": 26550 + }, + { + "ce_loss": 0.08056189864873886, + "epoch": 8.855903935957304, + "step": 26550 + }, + { + "distill_loss": 0.19755177199840546, + "epoch": 8.855903935957304, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "ref_ce_loss": 0.06736253947019577, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "loss": 0.31577253341674805, + "step": 26550 + }, + { + "ce_loss": 0.029894186183810234, + "epoch": 8.855903935957304, + "step": 26550 + }, + { + "distill_loss": 0.16263779997825623, + "epoch": 8.855903935957304, + "step": 26550 + }, + { + "epoch": 8.855903935957304, + "ref_ce_loss": 0.06558357179164886, + "step": 26550 + }, + { + "epoch": 8.85923949299533, + "loss": 0.3364, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "grad_norm": 1.1230621337890625, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "learning_rate": 2.699274610541438e-05, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "loss": 0.37808358669281006, + "step": 26560 + }, + { + "ce_loss": 0.03817775472998619, + "epoch": 8.85923949299533, + "step": 26560 + }, + { + "distill_loss": 0.16854511201381683, + "epoch": 8.85923949299533, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "ref_ce_loss": 0.07931677252054214, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "loss": 0.38735297322273254, + "step": 26560 + }, + { + "ce_loss": 0.06041007861495018, + "epoch": 8.85923949299533, + "step": 26560 + }, + { + "distill_loss": 0.2303398847579956, + "epoch": 8.85923949299533, + "step": 26560 + }, + { + "epoch": 8.85923949299533, + "ref_ce_loss": 0.09647249430418015, + "step": 26560 + }, + { + "epoch": 8.862575050033355, + "loss": 0.3923, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "grad_norm": 1.7934776544570923, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "learning_rate": 2.6836911284196363e-05, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "loss": 0.2773299217224121, + "step": 26570 + }, + { + "ce_loss": 0.05012989789247513, + "epoch": 8.862575050033355, + "step": 26570 + }, + { + "distill_loss": 0.1513012945652008, + "epoch": 8.862575050033355, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "ref_ce_loss": 0.05779888108372688, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "loss": 0.5293566584587097, + "step": 26570 + }, + { + "ce_loss": 0.09456038475036621, + "epoch": 8.862575050033355, + "step": 26570 + }, + { + "distill_loss": 0.22474022209644318, + "epoch": 8.862575050033355, + "step": 26570 + }, + { + "epoch": 8.862575050033355, + "ref_ce_loss": 0.06779570877552032, + "step": 26570 + }, + { + "epoch": 8.865910607071381, + "loss": 0.4038, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "grad_norm": 1.2114170789718628, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "learning_rate": 2.668151198445692e-05, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "loss": 0.7759679555892944, + "step": 26580 + }, + { + "ce_loss": 0.08975588530302048, + "epoch": 8.865910607071381, + "step": 26580 + }, + { + "distill_loss": 0.18606550991535187, + "epoch": 8.865910607071381, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "ref_ce_loss": 0.08597087115049362, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "loss": 0.5669875741004944, + "step": 26580 + }, + { + "ce_loss": 0.06825076043605804, + "epoch": 8.865910607071381, + "step": 26580 + }, + { + "distill_loss": 0.17964768409729004, + "epoch": 8.865910607071381, + "step": 26580 + }, + { + "epoch": 8.865910607071381, + "ref_ce_loss": 0.07940545678138733, + "step": 26580 + }, + { + "epoch": 8.869246164109406, + "loss": 0.3841, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "grad_norm": 1.518477439880371, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "learning_rate": 2.6526548387563722e-05, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "loss": 0.4557240605354309, + "step": 26590 + }, + { + "ce_loss": 0.04701073467731476, + "epoch": 8.869246164109406, + "step": 26590 + }, + { + "distill_loss": 0.1662449836730957, + "epoch": 8.869246164109406, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "ref_ce_loss": 0.06742780655622482, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "loss": 0.3855040967464447, + "step": 26590 + }, + { + "ce_loss": 0.08256342262029648, + "epoch": 8.869246164109406, + "step": 26590 + }, + { + "distill_loss": 0.18862977623939514, + "epoch": 8.869246164109406, + "step": 26590 + }, + { + "epoch": 8.869246164109406, + "ref_ce_loss": 0.08041725307703018, + "step": 26590 + }, + { + "epoch": 8.872581721147432, + "loss": 0.3927, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "grad_norm": 1.193103313446045, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "learning_rate": 2.637202067437605e-05, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "loss": 0.2769448161125183, + "step": 26600 + }, + { + "ce_loss": 0.021347135305404663, + "epoch": 8.872581721147432, + "step": 26600 + }, + { + "distill_loss": 0.12924575805664062, + "epoch": 8.872581721147432, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "ref_ce_loss": 0.05707591772079468, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "loss": 0.2787862718105316, + "step": 26600 + }, + { + "ce_loss": 0.028742941096425056, + "epoch": 8.872581721147432, + "step": 26600 + }, + { + "distill_loss": 0.19323575496673584, + "epoch": 8.872581721147432, + "step": 26600 + }, + { + "epoch": 8.872581721147432, + "ref_ce_loss": 0.05664723366498947, + "step": 26600 + }, + { + "epoch": 8.875917278185456, + "loss": 0.3619, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "grad_norm": 0.8265054225921631, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "learning_rate": 2.6217929025244182e-05, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "loss": 0.38113945722579956, + "step": 26610 + }, + { + "ce_loss": 0.037726499140262604, + "epoch": 8.875917278185456, + "step": 26610 + }, + { + "distill_loss": 0.19174793362617493, + "epoch": 8.875917278185456, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "ref_ce_loss": 0.06536515057086945, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "loss": 0.4439065158367157, + "step": 26610 + }, + { + "ce_loss": 0.10350029915571213, + "epoch": 8.875917278185456, + "step": 26610 + }, + { + "distill_loss": 0.19466392695903778, + "epoch": 8.875917278185456, + "step": 26610 + }, + { + "epoch": 8.875917278185456, + "ref_ce_loss": 0.09374602884054184, + "step": 26610 + }, + { + "epoch": 8.879252835223483, + "loss": 0.3905, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "grad_norm": 1.5056222677230835, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "learning_rate": 2.606427362000976e-05, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "loss": 0.9306776523590088, + "step": 26620 + }, + { + "ce_loss": 0.05697760358452797, + "epoch": 8.879252835223483, + "step": 26620 + }, + { + "distill_loss": 0.22343315184116364, + "epoch": 8.879252835223483, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "ref_ce_loss": 0.08988448232412338, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "loss": 0.3787762224674225, + "step": 26620 + }, + { + "ce_loss": 0.08007381856441498, + "epoch": 8.879252835223483, + "step": 26620 + }, + { + "distill_loss": 0.2230757623910904, + "epoch": 8.879252835223483, + "step": 26620 + }, + { + "epoch": 8.879252835223483, + "ref_ce_loss": 0.07537058740854263, + "step": 26620 + }, + { + "epoch": 8.882588392261507, + "loss": 0.4268, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "grad_norm": 1.1199023723602295, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "learning_rate": 2.5911054638005115e-05, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "loss": 0.23840931057929993, + "step": 26630 + }, + { + "ce_loss": 0.028699828311800957, + "epoch": 8.882588392261507, + "step": 26630 + }, + { + "distill_loss": 0.15274080634117126, + "epoch": 8.882588392261507, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "ref_ce_loss": 0.0567190982401371, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "loss": 0.32390138506889343, + "step": 26630 + }, + { + "ce_loss": 0.059554897248744965, + "epoch": 8.882588392261507, + "step": 26630 + }, + { + "distill_loss": 0.1620517522096634, + "epoch": 8.882588392261507, + "step": 26630 + }, + { + "epoch": 8.882588392261507, + "ref_ce_loss": 0.07728970050811768, + "step": 26630 + }, + { + "epoch": 8.885923949299533, + "loss": 0.3893, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "grad_norm": 1.9550362825393677, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "learning_rate": 2.5758272258053473e-05, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "loss": 0.38129478693008423, + "step": 26640 + }, + { + "ce_loss": 0.06399346888065338, + "epoch": 8.885923949299533, + "step": 26640 + }, + { + "distill_loss": 0.19712619483470917, + "epoch": 8.885923949299533, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "ref_ce_loss": 0.08902854472398758, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "loss": 0.5494421720504761, + "step": 26640 + }, + { + "ce_loss": 0.09578592330217361, + "epoch": 8.885923949299533, + "step": 26640 + }, + { + "distill_loss": 0.1795535534620285, + "epoch": 8.885923949299533, + "step": 26640 + }, + { + "epoch": 8.885923949299533, + "ref_ce_loss": 0.07753363251686096, + "step": 26640 + }, + { + "epoch": 8.889259506337558, + "loss": 0.3977, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "grad_norm": 1.3418828248977661, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "learning_rate": 2.5605926658468105e-05, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "loss": 0.5159924626350403, + "step": 26650 + }, + { + "ce_loss": 0.06970398128032684, + "epoch": 8.889259506337558, + "step": 26650 + }, + { + "distill_loss": 0.22379133105278015, + "epoch": 8.889259506337558, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "ref_ce_loss": 0.10952253639698029, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "loss": 0.40654951333999634, + "step": 26650 + }, + { + "ce_loss": 0.061383120715618134, + "epoch": 8.889259506337558, + "step": 26650 + }, + { + "distill_loss": 0.17689788341522217, + "epoch": 8.889259506337558, + "step": 26650 + }, + { + "epoch": 8.889259506337558, + "ref_ce_loss": 0.07978083193302155, + "step": 26650 + }, + { + "epoch": 8.892595063375584, + "loss": 0.4451, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "grad_norm": 1.0386784076690674, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "learning_rate": 2.5454018017052826e-05, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "loss": 0.40768587589263916, + "step": 26660 + }, + { + "ce_loss": 0.06482963263988495, + "epoch": 8.892595063375584, + "step": 26660 + }, + { + "distill_loss": 0.1913122683763504, + "epoch": 8.892595063375584, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "ref_ce_loss": 0.08091914653778076, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "loss": 0.4252420663833618, + "step": 26660 + }, + { + "ce_loss": 0.07114198803901672, + "epoch": 8.892595063375584, + "step": 26660 + }, + { + "distill_loss": 0.18928878009319305, + "epoch": 8.892595063375584, + "step": 26660 + }, + { + "epoch": 8.892595063375584, + "ref_ce_loss": 0.07031705230474472, + "step": 26660 + }, + { + "epoch": 8.895930620413608, + "loss": 0.362, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "grad_norm": 1.1728801727294922, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "learning_rate": 2.5302546511101333e-05, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "loss": 0.3263809084892273, + "step": 26670 + }, + { + "ce_loss": 0.06961500644683838, + "epoch": 8.895930620413608, + "step": 26670 + }, + { + "distill_loss": 0.18741771578788757, + "epoch": 8.895930620413608, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "ref_ce_loss": 0.06905804574489594, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "loss": 0.4924054443836212, + "step": 26670 + }, + { + "ce_loss": 0.12667293846607208, + "epoch": 8.895930620413608, + "step": 26670 + }, + { + "distill_loss": 0.2042117863893509, + "epoch": 8.895930620413608, + "step": 26670 + }, + { + "epoch": 8.895930620413608, + "ref_ce_loss": 0.12243467569351196, + "step": 26670 + }, + { + "epoch": 8.899266177451635, + "loss": 0.4095, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "grad_norm": 1.0906306505203247, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "learning_rate": 2.515151231739723e-05, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "loss": 0.36820027232170105, + "step": 26680 + }, + { + "ce_loss": 0.07361310720443726, + "epoch": 8.899266177451635, + "step": 26680 + }, + { + "distill_loss": 0.17105498909950256, + "epoch": 8.899266177451635, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "ref_ce_loss": 0.09113267064094543, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "loss": 0.29275742173194885, + "step": 26680 + }, + { + "ce_loss": 0.0631311684846878, + "epoch": 8.899266177451635, + "step": 26680 + }, + { + "distill_loss": 0.16057291626930237, + "epoch": 8.899266177451635, + "step": 26680 + }, + { + "epoch": 8.899266177451635, + "ref_ce_loss": 0.05351213365793228, + "step": 26680 + }, + { + "epoch": 8.902601734489659, + "loss": 0.4033, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "grad_norm": 1.6852984428405762, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "learning_rate": 2.500091561221356e-05, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "loss": 0.31909945607185364, + "step": 26690 + }, + { + "ce_loss": 0.043618910014629364, + "epoch": 8.902601734489659, + "step": 26690 + }, + { + "distill_loss": 0.14286518096923828, + "epoch": 8.902601734489659, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "ref_ce_loss": 0.057593002915382385, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "loss": 0.22959819436073303, + "step": 26690 + }, + { + "ce_loss": 0.028676237910985947, + "epoch": 8.902601734489659, + "step": 26690 + }, + { + "distill_loss": 0.12937778234481812, + "epoch": 8.902601734489659, + "step": 26690 + }, + { + "epoch": 8.902601734489659, + "ref_ce_loss": 0.05122470483183861, + "step": 26690 + }, + { + "epoch": 8.905937291527685, + "loss": 0.3535, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "grad_norm": 1.6896381378173828, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "learning_rate": 2.4850756571312927e-05, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "loss": 0.21083098649978638, + "step": 26700 + }, + { + "ce_loss": 0.02656303346157074, + "epoch": 8.905937291527685, + "step": 26700 + }, + { + "distill_loss": 0.11085355281829834, + "epoch": 8.905937291527685, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "ref_ce_loss": 0.04174497351050377, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "loss": 0.41885051131248474, + "step": 26700 + }, + { + "ce_loss": 0.03569251671433449, + "epoch": 8.905937291527685, + "step": 26700 + }, + { + "distill_loss": 0.15860214829444885, + "epoch": 8.905937291527685, + "step": 26700 + }, + { + "epoch": 8.905937291527685, + "ref_ce_loss": 0.07976032793521881, + "step": 26700 + }, + { + "epoch": 8.90927284856571, + "loss": 0.4428, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "grad_norm": 2.085545778274536, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "learning_rate": 2.47010353699471e-05, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "loss": 0.35387659072875977, + "step": 26710 + }, + { + "ce_loss": 0.04973173886537552, + "epoch": 8.90927284856571, + "step": 26710 + }, + { + "distill_loss": 0.16496436297893524, + "epoch": 8.90927284856571, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "ref_ce_loss": 0.06505236029624939, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "loss": 0.35952889919281006, + "step": 26710 + }, + { + "ce_loss": 0.08064170926809311, + "epoch": 8.90927284856571, + "step": 26710 + }, + { + "distill_loss": 0.18596360087394714, + "epoch": 8.90927284856571, + "step": 26710 + }, + { + "epoch": 8.90927284856571, + "ref_ce_loss": 0.07266595214605331, + "step": 26710 + }, + { + "epoch": 8.912608405603736, + "loss": 0.4175, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "grad_norm": 2.2813949584960938, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "learning_rate": 2.4551752182856923e-05, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "loss": 0.408029168844223, + "step": 26720 + }, + { + "ce_loss": 0.05037664622068405, + "epoch": 8.912608405603736, + "step": 26720 + }, + { + "distill_loss": 0.19227397441864014, + "epoch": 8.912608405603736, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "ref_ce_loss": 0.08918608725070953, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "loss": 0.5027357339859009, + "step": 26720 + }, + { + "ce_loss": 0.10362458229064941, + "epoch": 8.912608405603736, + "step": 26720 + }, + { + "distill_loss": 0.19801238179206848, + "epoch": 8.912608405603736, + "step": 26720 + }, + { + "epoch": 8.912608405603736, + "ref_ce_loss": 0.11099687218666077, + "step": 26720 + }, + { + "epoch": 8.91594396264176, + "loss": 0.4118, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "grad_norm": 1.9336222410202026, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "learning_rate": 2.4402907184271783e-05, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "loss": 0.29472947120666504, + "step": 26730 + }, + { + "ce_loss": 0.027242260053753853, + "epoch": 8.91594396264176, + "step": 26730 + }, + { + "distill_loss": 0.18607719242572784, + "epoch": 8.91594396264176, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "ref_ce_loss": 0.058557845652103424, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "loss": 0.3766292929649353, + "step": 26730 + }, + { + "ce_loss": 0.051607754081487656, + "epoch": 8.91594396264176, + "step": 26730 + }, + { + "distill_loss": 0.19452683627605438, + "epoch": 8.91594396264176, + "step": 26730 + }, + { + "epoch": 8.91594396264176, + "ref_ce_loss": 0.08398887515068054, + "step": 26730 + }, + { + "epoch": 8.919279519679787, + "loss": 0.3726, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "grad_norm": 1.3201515674591064, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "learning_rate": 2.4254500547909787e-05, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "loss": 0.31672608852386475, + "step": 26740 + }, + { + "ce_loss": 0.07216809689998627, + "epoch": 8.919279519679787, + "step": 26740 + }, + { + "distill_loss": 0.16296908259391785, + "epoch": 8.919279519679787, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "ref_ce_loss": 0.08134753257036209, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "loss": 0.37821874022483826, + "step": 26740 + }, + { + "ce_loss": 0.06907200068235397, + "epoch": 8.919279519679787, + "step": 26740 + }, + { + "distill_loss": 0.2010176181793213, + "epoch": 8.919279519679787, + "step": 26740 + }, + { + "epoch": 8.919279519679787, + "ref_ce_loss": 0.08417536318302155, + "step": 26740 + }, + { + "epoch": 8.922615076717811, + "loss": 0.4093, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "grad_norm": 1.8416671752929688, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "learning_rate": 2.4106532446977715e-05, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "loss": 0.3364465832710266, + "step": 26750 + }, + { + "ce_loss": 0.0730133056640625, + "epoch": 8.922615076717811, + "step": 26750 + }, + { + "distill_loss": 0.14422747492790222, + "epoch": 8.922615076717811, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "ref_ce_loss": 0.06264062970876694, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "loss": 0.33681172132492065, + "step": 26750 + }, + { + "ce_loss": 0.05357135459780693, + "epoch": 8.922615076717811, + "step": 26750 + }, + { + "distill_loss": 0.16947956383228302, + "epoch": 8.922615076717811, + "step": 26750 + }, + { + "epoch": 8.922615076717811, + "ref_ce_loss": 0.07739750295877457, + "step": 26750 + }, + { + "epoch": 8.925950633755837, + "loss": 0.3788, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "grad_norm": 1.0190787315368652, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "learning_rate": 2.395900305416996e-05, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "loss": 0.34849247336387634, + "step": 26760 + }, + { + "ce_loss": 0.04305924475193024, + "epoch": 8.925950633755837, + "step": 26760 + }, + { + "distill_loss": 0.20099864900112152, + "epoch": 8.925950633755837, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "ref_ce_loss": 0.07525210082530975, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "loss": 0.42576754093170166, + "step": 26760 + }, + { + "ce_loss": 0.05474647879600525, + "epoch": 8.925950633755837, + "step": 26760 + }, + { + "distill_loss": 0.17419682443141937, + "epoch": 8.925950633755837, + "step": 26760 + }, + { + "epoch": 8.925950633755837, + "ref_ce_loss": 0.05943509563803673, + "step": 26760 + }, + { + "epoch": 8.929286190793862, + "loss": 0.4248, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "grad_norm": 1.1845428943634033, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "learning_rate": 2.3811912541669368e-05, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "loss": 0.6398705840110779, + "step": 26770 + }, + { + "ce_loss": 0.07201068103313446, + "epoch": 8.929286190793862, + "step": 26770 + }, + { + "distill_loss": 0.18719375133514404, + "epoch": 8.929286190793862, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "ref_ce_loss": 0.07512786239385605, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "loss": 0.5534118413925171, + "step": 26770 + }, + { + "ce_loss": 0.058874137699604034, + "epoch": 8.929286190793862, + "step": 26770 + }, + { + "distill_loss": 0.22906601428985596, + "epoch": 8.929286190793862, + "step": 26770 + }, + { + "epoch": 8.929286190793862, + "ref_ce_loss": 0.09107699245214462, + "step": 26770 + }, + { + "epoch": 8.932621747831888, + "loss": 0.3822, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "grad_norm": 1.1596795320510864, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "learning_rate": 2.366526108114635e-05, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "loss": 0.37828749418258667, + "step": 26780 + }, + { + "ce_loss": 0.09381183981895447, + "epoch": 8.932621747831888, + "step": 26780 + }, + { + "distill_loss": 0.19912533462047577, + "epoch": 8.932621747831888, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "ref_ce_loss": 0.037565797567367554, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "loss": 0.2733290493488312, + "step": 26780 + }, + { + "ce_loss": 0.06196272000670433, + "epoch": 8.932621747831888, + "step": 26780 + }, + { + "distill_loss": 0.14887866377830505, + "epoch": 8.932621747831888, + "step": 26780 + }, + { + "epoch": 8.932621747831888, + "ref_ce_loss": 0.062240395694971085, + "step": 26780 + }, + { + "epoch": 8.935957304869913, + "loss": 0.4107, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "grad_norm": 1.1044889688491821, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "learning_rate": 2.3519048843758973e-05, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "loss": 0.27776286005973816, + "step": 26790 + }, + { + "ce_loss": 0.04569617286324501, + "epoch": 8.935957304869913, + "step": 26790 + }, + { + "distill_loss": 0.17505523562431335, + "epoch": 8.935957304869913, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "ref_ce_loss": 0.05621863901615143, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "loss": 0.3137989044189453, + "step": 26790 + }, + { + "ce_loss": 0.04200056195259094, + "epoch": 8.935957304869913, + "step": 26790 + }, + { + "distill_loss": 0.17961399257183075, + "epoch": 8.935957304869913, + "step": 26790 + }, + { + "epoch": 8.935957304869913, + "ref_ce_loss": 0.06352463364601135, + "step": 26790 + }, + { + "epoch": 8.939292861907939, + "loss": 0.3986, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "grad_norm": 1.3312222957611084, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "learning_rate": 2.3373276000152645e-05, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "loss": 0.7195945978164673, + "step": 26800 + }, + { + "ce_loss": 0.05876842886209488, + "epoch": 8.939292861907939, + "step": 26800 + }, + { + "distill_loss": 0.21101140975952148, + "epoch": 8.939292861907939, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "ref_ce_loss": 0.09533131867647171, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "loss": 0.4126310646533966, + "step": 26800 + }, + { + "ce_loss": 0.042031995952129364, + "epoch": 8.939292861907939, + "step": 26800 + }, + { + "distill_loss": 0.1630733758211136, + "epoch": 8.939292861907939, + "step": 26800 + }, + { + "epoch": 8.939292861907939, + "ref_ce_loss": 0.07871302217245102, + "step": 26800 + }, + { + "epoch": 8.942628418945963, + "loss": 0.443, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "grad_norm": 2.133390426635742, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "learning_rate": 2.3227942720459896e-05, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "loss": 0.36603066325187683, + "step": 26810 + }, + { + "ce_loss": 0.05009675770998001, + "epoch": 8.942628418945963, + "step": 26810 + }, + { + "distill_loss": 0.21553771197795868, + "epoch": 8.942628418945963, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "ref_ce_loss": 0.10018293559551239, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "loss": 0.30181068181991577, + "step": 26810 + }, + { + "ce_loss": 0.0363122820854187, + "epoch": 8.942628418945963, + "step": 26810 + }, + { + "distill_loss": 0.1500847041606903, + "epoch": 8.942628418945963, + "step": 26810 + }, + { + "epoch": 8.942628418945963, + "ref_ce_loss": 0.08789963275194168, + "step": 26810 + }, + { + "epoch": 8.94596397598399, + "loss": 0.3884, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "grad_norm": 2.869149684906006, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "learning_rate": 2.3083049174300332e-05, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "loss": 0.3147127032279968, + "step": 26820 + }, + { + "ce_loss": 0.02978779561817646, + "epoch": 8.94596397598399, + "step": 26820 + }, + { + "distill_loss": 0.20026488602161407, + "epoch": 8.94596397598399, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "ref_ce_loss": 0.0839177593588829, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "loss": 0.27547311782836914, + "step": 26820 + }, + { + "ce_loss": 0.040116142481565475, + "epoch": 8.94596397598399, + "step": 26820 + }, + { + "distill_loss": 0.16665637493133545, + "epoch": 8.94596397598399, + "step": 26820 + }, + { + "epoch": 8.94596397598399, + "ref_ce_loss": 0.0683833584189415, + "step": 26820 + }, + { + "epoch": 8.949299533022014, + "loss": 0.3734, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "grad_norm": 1.3682441711425781, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "learning_rate": 2.2938595530780325e-05, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "loss": 0.37970811128616333, + "step": 26830 + }, + { + "ce_loss": 0.042396221309900284, + "epoch": 8.949299533022014, + "step": 26830 + }, + { + "distill_loss": 0.17948472499847412, + "epoch": 8.949299533022014, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "ref_ce_loss": 0.08113700896501541, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "loss": 0.3110351860523224, + "step": 26830 + }, + { + "ce_loss": 0.04640982300043106, + "epoch": 8.949299533022014, + "step": 26830 + }, + { + "distill_loss": 0.16055357456207275, + "epoch": 8.949299533022014, + "step": 26830 + }, + { + "epoch": 8.949299533022014, + "ref_ce_loss": 0.07340537011623383, + "step": 26830 + }, + { + "epoch": 8.95263509006004, + "loss": 0.3662, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "grad_norm": 1.3402122259140015, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "learning_rate": 2.279458195849289e-05, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "loss": 0.5659681558609009, + "step": 26840 + }, + { + "ce_loss": 0.03422752767801285, + "epoch": 8.95263509006004, + "step": 26840 + }, + { + "distill_loss": 0.18524156510829926, + "epoch": 8.95263509006004, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "ref_ce_loss": 0.06659182906150818, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "loss": 0.4884268045425415, + "step": 26840 + }, + { + "ce_loss": 0.10238605737686157, + "epoch": 8.95263509006004, + "step": 26840 + }, + { + "distill_loss": 0.224435955286026, + "epoch": 8.95263509006004, + "step": 26840 + }, + { + "epoch": 8.95263509006004, + "ref_ce_loss": 0.10202771425247192, + "step": 26840 + }, + { + "epoch": 8.955970647098065, + "loss": 0.435, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "grad_norm": 1.8279234170913696, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "learning_rate": 2.2651008625517168e-05, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "loss": 0.3548109531402588, + "step": 26850 + }, + { + "ce_loss": 0.059652846306562424, + "epoch": 8.955970647098065, + "step": 26850 + }, + { + "distill_loss": 0.15193244814872742, + "epoch": 8.955970647098065, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "ref_ce_loss": 0.0689244195818901, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "loss": 0.31553858518600464, + "step": 26850 + }, + { + "ce_loss": 0.0573444664478302, + "epoch": 8.955970647098065, + "step": 26850 + }, + { + "distill_loss": 0.18329612910747528, + "epoch": 8.955970647098065, + "step": 26850 + }, + { + "epoch": 8.955970647098065, + "ref_ce_loss": 0.05450303852558136, + "step": 26850 + }, + { + "epoch": 8.959306204136091, + "loss": 0.3977, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "grad_norm": 1.8529592752456665, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "learning_rate": 2.2507875699418855e-05, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "loss": 0.35732439160346985, + "step": 26860 + }, + { + "ce_loss": 0.06432899832725525, + "epoch": 8.959306204136091, + "step": 26860 + }, + { + "distill_loss": 0.1751275658607483, + "epoch": 8.959306204136091, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "ref_ce_loss": 0.0556345172226429, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "loss": 0.2738136351108551, + "step": 26860 + }, + { + "ce_loss": 0.02206704579293728, + "epoch": 8.959306204136091, + "step": 26860 + }, + { + "distill_loss": 0.16342735290527344, + "epoch": 8.959306204136091, + "step": 26860 + }, + { + "epoch": 8.959306204136091, + "ref_ce_loss": 0.04390689358115196, + "step": 26860 + }, + { + "epoch": 8.962641761174115, + "loss": 0.3404, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "grad_norm": 1.2462575435638428, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "learning_rate": 2.2365183347249486e-05, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "loss": 0.34319478273391724, + "step": 26870 + }, + { + "ce_loss": 0.06394265592098236, + "epoch": 8.962641761174115, + "step": 26870 + }, + { + "distill_loss": 0.2054232507944107, + "epoch": 8.962641761174115, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "ref_ce_loss": 0.05607668310403824, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "loss": 0.4374099373817444, + "step": 26870 + }, + { + "ce_loss": 0.05983159318566322, + "epoch": 8.962641761174115, + "step": 26870 + }, + { + "distill_loss": 0.2247617542743683, + "epoch": 8.962641761174115, + "step": 26870 + }, + { + "epoch": 8.962641761174115, + "ref_ce_loss": 0.07600671797990799, + "step": 26870 + }, + { + "epoch": 8.965977318212142, + "loss": 0.3653, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "grad_norm": 1.3279523849487305, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "learning_rate": 2.2222931735546327e-05, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "loss": 0.263740599155426, + "step": 26880 + }, + { + "ce_loss": 0.0419314019382, + "epoch": 8.965977318212142, + "step": 26880 + }, + { + "distill_loss": 0.14178217947483063, + "epoch": 8.965977318212142, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "ref_ce_loss": 0.07977408170700073, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "loss": 0.33606958389282227, + "step": 26880 + }, + { + "ce_loss": 0.0492708757519722, + "epoch": 8.965977318212142, + "step": 26880 + }, + { + "distill_loss": 0.1874486356973648, + "epoch": 8.965977318212142, + "step": 26880 + }, + { + "epoch": 8.965977318212142, + "ref_ce_loss": 0.07295677810907364, + "step": 26880 + }, + { + "epoch": 8.969312875250166, + "loss": 0.3834, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "grad_norm": 0.9264974594116211, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "learning_rate": 2.2081121030332262e-05, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "loss": 0.5124914050102234, + "step": 26890 + }, + { + "ce_loss": 0.07620886713266373, + "epoch": 8.969312875250166, + "step": 26890 + }, + { + "distill_loss": 0.25074541568756104, + "epoch": 8.969312875250166, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "ref_ce_loss": 0.08912394195795059, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "loss": 0.2802472412586212, + "step": 26890 + }, + { + "ce_loss": 0.044443145394325256, + "epoch": 8.969312875250166, + "step": 26890 + }, + { + "distill_loss": 0.14432059228420258, + "epoch": 8.969312875250166, + "step": 26890 + }, + { + "epoch": 8.969312875250166, + "ref_ce_loss": 0.06065668910741806, + "step": 26890 + }, + { + "epoch": 8.972648432288192, + "loss": 0.3987, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "grad_norm": 1.4514647722244263, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "learning_rate": 2.193975139711575e-05, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "loss": 0.48129957914352417, + "step": 26900 + }, + { + "ce_loss": 0.05864886939525604, + "epoch": 8.972648432288192, + "step": 26900 + }, + { + "distill_loss": 0.16718582808971405, + "epoch": 8.972648432288192, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "ref_ce_loss": 0.06964981555938721, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "loss": 0.4144725799560547, + "step": 26900 + }, + { + "ce_loss": 0.06540128588676453, + "epoch": 8.972648432288192, + "step": 26900 + }, + { + "distill_loss": 0.24699874222278595, + "epoch": 8.972648432288192, + "step": 26900 + }, + { + "epoch": 8.972648432288192, + "ref_ce_loss": 0.07367391884326935, + "step": 26900 + }, + { + "epoch": 8.975983989326217, + "loss": 0.3679, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "grad_norm": 1.1033375263214111, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "learning_rate": 2.17988230008904e-05, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "loss": 0.2607868015766144, + "step": 26910 + }, + { + "ce_loss": 0.02850893698632717, + "epoch": 8.975983989326217, + "step": 26910 + }, + { + "distill_loss": 0.1603696495294571, + "epoch": 8.975983989326217, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "ref_ce_loss": 0.04794909805059433, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "loss": 0.4003443717956543, + "step": 26910 + }, + { + "ce_loss": 0.05196845531463623, + "epoch": 8.975983989326217, + "step": 26910 + }, + { + "distill_loss": 0.1829056739807129, + "epoch": 8.975983989326217, + "step": 26910 + }, + { + "epoch": 8.975983989326217, + "ref_ce_loss": 0.08970125019550323, + "step": 26910 + }, + { + "epoch": 8.979319546364243, + "loss": 0.3785, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "grad_norm": 1.235166072845459, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "learning_rate": 2.165833600613465e-05, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "loss": 0.2534266710281372, + "step": 26920 + }, + { + "ce_loss": 0.03887072950601578, + "epoch": 8.979319546364243, + "step": 26920 + }, + { + "distill_loss": 0.1585645079612732, + "epoch": 8.979319546364243, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "ref_ce_loss": 0.05570882931351662, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "loss": 0.35729390382766724, + "step": 26920 + }, + { + "ce_loss": 0.04741301015019417, + "epoch": 8.979319546364243, + "step": 26920 + }, + { + "distill_loss": 0.1947993040084839, + "epoch": 8.979319546364243, + "step": 26920 + }, + { + "epoch": 8.979319546364243, + "ref_ce_loss": 0.08480067551136017, + "step": 26920 + }, + { + "epoch": 8.982655103402267, + "loss": 0.3672, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "grad_norm": 1.1416871547698975, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "learning_rate": 2.151829057681205e-05, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "loss": 0.30041593313217163, + "step": 26930 + }, + { + "ce_loss": 0.03660650923848152, + "epoch": 8.982655103402267, + "step": 26930 + }, + { + "distill_loss": 0.14040933549404144, + "epoch": 8.982655103402267, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "ref_ce_loss": 0.06650853157043457, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "loss": 0.3716757595539093, + "step": 26930 + }, + { + "ce_loss": 0.05143218860030174, + "epoch": 8.982655103402267, + "step": 26930 + }, + { + "distill_loss": 0.1850205957889557, + "epoch": 8.982655103402267, + "step": 26930 + }, + { + "epoch": 8.982655103402267, + "ref_ce_loss": 0.10274700075387955, + "step": 26930 + }, + { + "epoch": 8.985990660440294, + "loss": 0.385, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "grad_norm": 1.246756672859192, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "learning_rate": 2.1378686876370656e-05, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "loss": 0.2395678460597992, + "step": 26940 + }, + { + "ce_loss": 0.02067268081009388, + "epoch": 8.985990660440294, + "step": 26940 + }, + { + "distill_loss": 0.14293518662452698, + "epoch": 8.985990660440294, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "ref_ce_loss": 0.04838182032108307, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "loss": 0.23595669865608215, + "step": 26940 + }, + { + "ce_loss": 0.02198474481701851, + "epoch": 8.985990660440294, + "step": 26940 + }, + { + "distill_loss": 0.12834428250789642, + "epoch": 8.985990660440294, + "step": 26940 + }, + { + "epoch": 8.985990660440294, + "ref_ce_loss": 0.05260973051190376, + "step": 26940 + }, + { + "epoch": 8.989326217478318, + "loss": 0.3931, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "grad_norm": 1.2465322017669678, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "learning_rate": 2.123952506774307e-05, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "loss": 0.4282079339027405, + "step": 26950 + }, + { + "ce_loss": 0.10361059755086899, + "epoch": 8.989326217478318, + "step": 26950 + }, + { + "distill_loss": 0.19960245490074158, + "epoch": 8.989326217478318, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "ref_ce_loss": 0.09390628337860107, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "loss": 0.4041498005390167, + "step": 26950 + }, + { + "ce_loss": 0.06759005039930344, + "epoch": 8.989326217478318, + "step": 26950 + }, + { + "distill_loss": 0.20927953720092773, + "epoch": 8.989326217478318, + "step": 26950 + }, + { + "epoch": 8.989326217478318, + "ref_ce_loss": 0.07374574989080429, + "step": 26950 + }, + { + "epoch": 8.992661774516344, + "loss": 0.3814, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "grad_norm": 1.1587697267532349, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "learning_rate": 2.1100805313345907e-05, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "loss": 0.46586868166923523, + "step": 26960 + }, + { + "ce_loss": 0.05573577433824539, + "epoch": 8.992661774516344, + "step": 26960 + }, + { + "distill_loss": 0.2000192403793335, + "epoch": 8.992661774516344, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "ref_ce_loss": 0.06393946707248688, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "loss": 0.25798797607421875, + "step": 26960 + }, + { + "ce_loss": 0.027636967599391937, + "epoch": 8.992661774516344, + "step": 26960 + }, + { + "distill_loss": 0.1450037807226181, + "epoch": 8.992661774516344, + "step": 26960 + }, + { + "epoch": 8.992661774516344, + "ref_ce_loss": 0.058194465935230255, + "step": 26960 + }, + { + "epoch": 8.995997331554369, + "loss": 0.3803, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "grad_norm": 1.6340363025665283, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "learning_rate": 2.0962527775080275e-05, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "loss": 0.418100506067276, + "step": 26970 + }, + { + "ce_loss": 0.07010974735021591, + "epoch": 8.995997331554369, + "step": 26970 + }, + { + "distill_loss": 0.2157282531261444, + "epoch": 8.995997331554369, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "ref_ce_loss": 0.09423127770423889, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "loss": 0.7165571451187134, + "step": 26970 + }, + { + "ce_loss": 0.06447438895702362, + "epoch": 8.995997331554369, + "step": 26970 + }, + { + "distill_loss": 0.15642520785331726, + "epoch": 8.995997331554369, + "step": 26970 + }, + { + "epoch": 8.995997331554369, + "ref_ce_loss": 0.08733781427145004, + "step": 26970 + }, + { + "epoch": 8.999332888592395, + "loss": 0.4126, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "grad_norm": 1.3778339624404907, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "learning_rate": 2.082469261433082e-05, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "loss": 0.5340480208396912, + "step": 26980 + }, + { + "ce_loss": 0.07798661291599274, + "epoch": 8.999332888592395, + "step": 26980 + }, + { + "distill_loss": 0.21405644714832306, + "epoch": 8.999332888592395, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "ref_ce_loss": 0.08211926370859146, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "loss": 0.4084705710411072, + "step": 26980 + }, + { + "ce_loss": 0.07932519912719727, + "epoch": 8.999332888592395, + "step": 26980 + }, + { + "distill_loss": 0.18646341562271118, + "epoch": 8.999332888592395, + "step": 26980 + }, + { + "epoch": 8.999332888592395, + "ref_ce_loss": 0.07593373954296112, + "step": 26980 + }, + { + "epoch": 9.00266844563042, + "loss": 0.3665, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "grad_norm": 0.8887509107589722, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "learning_rate": 2.0687299991966104e-05, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "loss": 0.2786753177642822, + "step": 26990 + }, + { + "ce_loss": 0.03038201667368412, + "epoch": 9.00266844563042, + "step": 26990 + }, + { + "distill_loss": 0.15614908933639526, + "epoch": 9.00266844563042, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "ref_ce_loss": 0.053884122520685196, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "loss": 0.38568711280822754, + "step": 26990 + }, + { + "ce_loss": 0.053761474788188934, + "epoch": 9.00266844563042, + "step": 26990 + }, + { + "distill_loss": 0.22754529118537903, + "epoch": 9.00266844563042, + "step": 26990 + }, + { + "epoch": 9.00266844563042, + "ref_ce_loss": 0.07468869537115097, + "step": 26990 + }, + { + "epoch": 9.006004002668446, + "loss": 0.3239, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "grad_norm": 0.9528130292892456, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "learning_rate": 2.0550350068337987e-05, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "loss": 0.32287442684173584, + "step": 27000 + }, + { + "ce_loss": 0.03582875803112984, + "epoch": 9.006004002668446, + "step": 27000 + }, + { + "distill_loss": 0.19158251583576202, + "epoch": 9.006004002668446, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "ref_ce_loss": 0.06319230049848557, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "loss": 0.4253171682357788, + "step": 27000 + }, + { + "ce_loss": 0.03548054024577141, + "epoch": 9.006004002668446, + "step": 27000 + }, + { + "distill_loss": 0.15410447120666504, + "epoch": 9.006004002668446, + "step": 27000 + }, + { + "epoch": 9.006004002668446, + "ref_ce_loss": 0.060177017003297806, + "step": 27000 + }, + { + "epoch": 9.00933955970647, + "loss": 0.3901, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "grad_norm": 1.0065354108810425, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "learning_rate": 2.0413843003281818e-05, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "loss": 0.22900214791297913, + "step": 27010 + }, + { + "ce_loss": 0.020226458087563515, + "epoch": 9.00933955970647, + "step": 27010 + }, + { + "distill_loss": 0.14763259887695312, + "epoch": 9.00933955970647, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "ref_ce_loss": 0.051381126046180725, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "loss": 0.28583458065986633, + "step": 27010 + }, + { + "ce_loss": 0.04080132767558098, + "epoch": 9.00933955970647, + "step": 27010 + }, + { + "distill_loss": 0.1587204784154892, + "epoch": 9.00933955970647, + "step": 27010 + }, + { + "epoch": 9.00933955970647, + "ref_ce_loss": 0.06302133202552795, + "step": 27010 + }, + { + "epoch": 9.012675116744497, + "loss": 0.3127, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "grad_norm": 1.1079349517822266, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "learning_rate": 2.0277778956116023e-05, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "loss": 0.29996344447135925, + "step": 27020 + }, + { + "ce_loss": 0.03723428398370743, + "epoch": 9.012675116744497, + "step": 27020 + }, + { + "distill_loss": 0.19037845730781555, + "epoch": 9.012675116744497, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "ref_ce_loss": 0.05403125286102295, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "loss": 0.26005294919013977, + "step": 27020 + }, + { + "ce_loss": 0.017900634557008743, + "epoch": 9.012675116744497, + "step": 27020 + }, + { + "distill_loss": 0.10674063116312027, + "epoch": 9.012675116744497, + "step": 27020 + }, + { + "epoch": 9.012675116744497, + "ref_ce_loss": 0.04297463968396187, + "step": 27020 + }, + { + "epoch": 9.016010673782521, + "loss": 0.334, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "grad_norm": 0.9277923107147217, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "learning_rate": 2.0142158085642014e-05, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "loss": 0.32305458188056946, + "step": 27030 + }, + { + "ce_loss": 0.03489192947745323, + "epoch": 9.016010673782521, + "step": 27030 + }, + { + "distill_loss": 0.22198455035686493, + "epoch": 9.016010673782521, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "ref_ce_loss": 0.06594966351985931, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "loss": 0.24616390466690063, + "step": 27030 + }, + { + "ce_loss": 0.04013339802622795, + "epoch": 9.016010673782521, + "step": 27030 + }, + { + "distill_loss": 0.14001591503620148, + "epoch": 9.016010673782521, + "step": 27030 + }, + { + "epoch": 9.016010673782521, + "ref_ce_loss": 0.04386579245328903, + "step": 27030 + }, + { + "epoch": 9.019346230820547, + "loss": 0.3458, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "grad_norm": 0.9151402711868286, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "learning_rate": 2.0006980550143893e-05, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "loss": 0.3370954990386963, + "step": 27040 + }, + { + "ce_loss": 0.030183374881744385, + "epoch": 9.019346230820547, + "step": 27040 + }, + { + "distill_loss": 0.17306430637836456, + "epoch": 9.019346230820547, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "ref_ce_loss": 0.06660886108875275, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "loss": 0.39963436126708984, + "step": 27040 + }, + { + "ce_loss": 0.03261478245258331, + "epoch": 9.019346230820547, + "step": 27040 + }, + { + "distill_loss": 0.18940173089504242, + "epoch": 9.019346230820547, + "step": 27040 + }, + { + "epoch": 9.019346230820547, + "ref_ce_loss": 0.06742677092552185, + "step": 27040 + }, + { + "epoch": 9.022681787858572, + "loss": 0.3546, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "grad_norm": 0.9811869263648987, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "learning_rate": 1.9872246507388394e-05, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "loss": 0.20993368327617645, + "step": 27050 + }, + { + "ce_loss": 0.01603628695011139, + "epoch": 9.022681787858572, + "step": 27050 + }, + { + "distill_loss": 0.13603952527046204, + "epoch": 9.022681787858572, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "ref_ce_loss": 0.04286492243409157, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "loss": 0.4756922125816345, + "step": 27050 + }, + { + "ce_loss": 0.060111287981271744, + "epoch": 9.022681787858572, + "step": 27050 + }, + { + "distill_loss": 0.2132369875907898, + "epoch": 9.022681787858572, + "step": 27050 + }, + { + "epoch": 9.022681787858572, + "ref_ce_loss": 0.07596529275178909, + "step": 27050 + }, + { + "epoch": 9.026017344896598, + "loss": 0.3577, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "grad_norm": 1.1416993141174316, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "learning_rate": 1.9737956114624655e-05, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "loss": 0.35747769474983215, + "step": 27060 + }, + { + "ce_loss": 0.03079773299396038, + "epoch": 9.026017344896598, + "step": 27060 + }, + { + "distill_loss": 0.2055409699678421, + "epoch": 9.026017344896598, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "ref_ce_loss": 0.08336233347654343, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "loss": 0.2289038896560669, + "step": 27060 + }, + { + "ce_loss": 0.009864287450909615, + "epoch": 9.026017344896598, + "step": 27060 + }, + { + "distill_loss": 0.12856513261795044, + "epoch": 9.026017344896598, + "step": 27060 + }, + { + "epoch": 9.026017344896598, + "ref_ce_loss": 0.059212226420640945, + "step": 27060 + }, + { + "epoch": 9.029352901934622, + "loss": 0.36, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "grad_norm": 1.1386539936065674, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "learning_rate": 1.9604109528584025e-05, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "loss": 0.297063946723938, + "step": 27070 + }, + { + "ce_loss": 0.017145255580544472, + "epoch": 9.029352901934622, + "step": 27070 + }, + { + "distill_loss": 0.20598909258842468, + "epoch": 9.029352901934622, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "ref_ce_loss": 0.07353837788105011, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "loss": 0.2827029228210449, + "step": 27070 + }, + { + "ce_loss": 0.020517654716968536, + "epoch": 9.029352901934622, + "step": 27070 + }, + { + "distill_loss": 0.18179626762866974, + "epoch": 9.029352901934622, + "step": 27070 + }, + { + "epoch": 9.029352901934622, + "ref_ce_loss": 0.08018902689218521, + "step": 27070 + }, + { + "epoch": 9.032688458972649, + "loss": 0.3358, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "grad_norm": 1.0653818845748901, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "learning_rate": 1.9470706905479897e-05, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "loss": 0.2931612432003021, + "step": 27080 + }, + { + "ce_loss": 0.04542126879096031, + "epoch": 9.032688458972649, + "step": 27080 + }, + { + "distill_loss": 0.16980938613414764, + "epoch": 9.032688458972649, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "ref_ce_loss": 0.054527461528778076, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "loss": 0.28220316767692566, + "step": 27080 + }, + { + "ce_loss": 0.06899258494377136, + "epoch": 9.032688458972649, + "step": 27080 + }, + { + "distill_loss": 0.14135973155498505, + "epoch": 9.032688458972649, + "step": 27080 + }, + { + "epoch": 9.032688458972649, + "ref_ce_loss": 0.07163500785827637, + "step": 27080 + }, + { + "epoch": 9.036024016010673, + "loss": 0.3818, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "grad_norm": 0.8686916828155518, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "learning_rate": 1.9337748401007418e-05, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "loss": 0.3689887523651123, + "step": 27090 + }, + { + "ce_loss": 0.041207872331142426, + "epoch": 9.036024016010673, + "step": 27090 + }, + { + "distill_loss": 0.18242886662483215, + "epoch": 9.036024016010673, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "ref_ce_loss": 0.06660563498735428, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "loss": 0.3805079460144043, + "step": 27090 + }, + { + "ce_loss": 0.06088177487254143, + "epoch": 9.036024016010673, + "step": 27090 + }, + { + "distill_loss": 0.2561386823654175, + "epoch": 9.036024016010673, + "step": 27090 + }, + { + "epoch": 9.036024016010673, + "ref_ce_loss": 0.06328141689300537, + "step": 27090 + }, + { + "epoch": 9.0393595730487, + "loss": 0.3654, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "grad_norm": 1.8930615186691284, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "learning_rate": 1.9205234170343567e-05, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "loss": 0.2853085994720459, + "step": 27100 + }, + { + "ce_loss": 0.028148401528596878, + "epoch": 9.0393595730487, + "step": 27100 + }, + { + "distill_loss": 0.1661919206380844, + "epoch": 9.0393595730487, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "ref_ce_loss": 0.055086735635995865, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "loss": 0.34833627939224243, + "step": 27100 + }, + { + "ce_loss": 0.036774538457393646, + "epoch": 9.0393595730487, + "step": 27100 + }, + { + "distill_loss": 0.14977332949638367, + "epoch": 9.0393595730487, + "step": 27100 + }, + { + "epoch": 9.0393595730487, + "ref_ce_loss": 0.0676417276263237, + "step": 27100 + }, + { + "epoch": 9.042695130086724, + "loss": 0.3574, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "grad_norm": 0.9850945472717285, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "learning_rate": 1.907316436814659e-05, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "loss": 0.3692997395992279, + "step": 27110 + }, + { + "ce_loss": 0.030617495998740196, + "epoch": 9.042695130086724, + "step": 27110 + }, + { + "distill_loss": 0.17378318309783936, + "epoch": 9.042695130086724, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "ref_ce_loss": 0.07603063434362411, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "loss": 0.4435376822948456, + "step": 27110 + }, + { + "ce_loss": 0.05087882652878761, + "epoch": 9.042695130086724, + "step": 27110 + }, + { + "distill_loss": 0.19516976177692413, + "epoch": 9.042695130086724, + "step": 27110 + }, + { + "epoch": 9.042695130086724, + "ref_ce_loss": 0.0701480358839035, + "step": 27110 + }, + { + "epoch": 9.04603068712475, + "loss": 0.3588, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "grad_norm": 1.261003017425537, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "learning_rate": 1.894153914855625e-05, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "loss": 0.2516244947910309, + "step": 27120 + }, + { + "ce_loss": 0.01986541971564293, + "epoch": 9.04603068712475, + "step": 27120 + }, + { + "distill_loss": 0.18128304183483124, + "epoch": 9.04603068712475, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "ref_ce_loss": 0.04984374716877937, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "loss": 0.37950924038887024, + "step": 27120 + }, + { + "ce_loss": 0.0460323728621006, + "epoch": 9.04603068712475, + "step": 27120 + }, + { + "distill_loss": 0.21613475680351257, + "epoch": 9.04603068712475, + "step": 27120 + }, + { + "epoch": 9.04603068712475, + "ref_ce_loss": 0.07548118382692337, + "step": 27120 + }, + { + "epoch": 9.049366244162774, + "loss": 0.3741, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "grad_norm": 1.4003740549087524, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "learning_rate": 1.8810358665193273e-05, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "loss": 0.24134016036987305, + "step": 27130 + }, + { + "ce_loss": 0.01722213812172413, + "epoch": 9.049366244162774, + "step": 27130 + }, + { + "distill_loss": 0.12212380766868591, + "epoch": 9.049366244162774, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "ref_ce_loss": 0.04552808403968811, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "loss": 0.29700788855552673, + "step": 27130 + }, + { + "ce_loss": 0.013893130235373974, + "epoch": 9.049366244162774, + "step": 27130 + }, + { + "distill_loss": 0.1976793259382248, + "epoch": 9.049366244162774, + "step": 27130 + }, + { + "epoch": 9.049366244162774, + "ref_ce_loss": 0.05386512726545334, + "step": 27130 + }, + { + "epoch": 9.0527018012008, + "loss": 0.3745, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "grad_norm": 1.0930335521697998, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "learning_rate": 1.8679623071159535e-05, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "loss": 0.2979638874530792, + "step": 27140 + }, + { + "ce_loss": 0.038029931485652924, + "epoch": 9.0527018012008, + "step": 27140 + }, + { + "distill_loss": 0.17696678638458252, + "epoch": 9.0527018012008, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "ref_ce_loss": 0.0641922727227211, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "loss": 0.3175976276397705, + "step": 27140 + }, + { + "ce_loss": 0.027088569477200508, + "epoch": 9.0527018012008, + "step": 27140 + }, + { + "distill_loss": 0.12965548038482666, + "epoch": 9.0527018012008, + "step": 27140 + }, + { + "epoch": 9.0527018012008, + "ref_ce_loss": 0.07547727227210999, + "step": 27140 + }, + { + "epoch": 9.056037358238825, + "loss": 0.3573, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "grad_norm": 1.1945284605026245, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "learning_rate": 1.8549332519037344e-05, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "loss": 0.369720458984375, + "step": 27150 + }, + { + "ce_loss": 0.041470445692539215, + "epoch": 9.056037358238825, + "step": 27150 + }, + { + "distill_loss": 0.24201340973377228, + "epoch": 9.056037358238825, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "ref_ce_loss": 0.058247197419404984, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "loss": 0.2370813488960266, + "step": 27150 + }, + { + "ce_loss": 0.011418032459914684, + "epoch": 9.056037358238825, + "step": 27150 + }, + { + "distill_loss": 0.13475269079208374, + "epoch": 9.056037358238825, + "step": 27150 + }, + { + "epoch": 9.056037358238825, + "ref_ce_loss": 0.059003476053476334, + "step": 27150 + }, + { + "epoch": 9.059372915276851, + "loss": 0.3416, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "grad_norm": 0.990787923336029, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "learning_rate": 1.8419487160889947e-05, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "loss": 0.33093833923339844, + "step": 27160 + }, + { + "ce_loss": 0.05214123800396919, + "epoch": 9.059372915276851, + "step": 27160 + }, + { + "distill_loss": 0.19237744808197021, + "epoch": 9.059372915276851, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "ref_ce_loss": 0.06416904181241989, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "loss": 0.3283519446849823, + "step": 27160 + }, + { + "ce_loss": 0.037675682455301285, + "epoch": 9.059372915276851, + "step": 27160 + }, + { + "distill_loss": 0.18677571415901184, + "epoch": 9.059372915276851, + "step": 27160 + }, + { + "epoch": 9.059372915276851, + "ref_ce_loss": 0.07229268550872803, + "step": 27160 + }, + { + "epoch": 9.062708472314876, + "loss": 0.3212, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "grad_norm": 1.0613186359405518, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "learning_rate": 1.8290087148260748e-05, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "loss": 0.17752593755722046, + "step": 27170 + }, + { + "ce_loss": 0.021205492317676544, + "epoch": 9.062708472314876, + "step": 27170 + }, + { + "distill_loss": 0.10737836360931396, + "epoch": 9.062708472314876, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "ref_ce_loss": 0.03590012341737747, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "loss": 0.38357293605804443, + "step": 27170 + }, + { + "ce_loss": 0.07017988711595535, + "epoch": 9.062708472314876, + "step": 27170 + }, + { + "distill_loss": 0.16169802844524384, + "epoch": 9.062708472314876, + "step": 27170 + }, + { + "epoch": 9.062708472314876, + "ref_ce_loss": 0.07344874739646912, + "step": 27170 + }, + { + "epoch": 9.066044029352902, + "loss": 0.3508, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "grad_norm": 1.3445948362350464, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "learning_rate": 1.8161132632173562e-05, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "loss": 0.3429279625415802, + "step": 27180 + }, + { + "ce_loss": 0.03333231061697006, + "epoch": 9.066044029352902, + "step": 27180 + }, + { + "distill_loss": 0.1579761505126953, + "epoch": 9.066044029352902, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "ref_ce_loss": 0.06460767239332199, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "loss": 0.53684401512146, + "step": 27180 + }, + { + "ce_loss": 0.0448799729347229, + "epoch": 9.066044029352902, + "step": 27180 + }, + { + "distill_loss": 0.23155061900615692, + "epoch": 9.066044029352902, + "step": 27180 + }, + { + "epoch": 9.066044029352902, + "ref_ce_loss": 0.06540628522634506, + "step": 27180 + }, + { + "epoch": 9.069379586390927, + "loss": 0.3759, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "grad_norm": 1.0961995124816895, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "learning_rate": 1.803262376313213e-05, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "loss": 0.26204830408096313, + "step": 27190 + }, + { + "ce_loss": 0.033201009035110474, + "epoch": 9.069379586390927, + "step": 27190 + }, + { + "distill_loss": 0.1589648425579071, + "epoch": 9.069379586390927, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "ref_ce_loss": 0.05141977593302727, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "loss": 0.2719113826751709, + "step": 27190 + }, + { + "ce_loss": 0.031158355996012688, + "epoch": 9.069379586390927, + "step": 27190 + }, + { + "distill_loss": 0.160211443901062, + "epoch": 9.069379586390927, + "step": 27190 + }, + { + "epoch": 9.069379586390927, + "ref_ce_loss": 0.05910457298159599, + "step": 27190 + }, + { + "epoch": 9.072715143428953, + "loss": 0.3287, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "grad_norm": 0.9663136005401611, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "learning_rate": 1.7904560691120164e-05, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "loss": 0.41447609663009644, + "step": 27200 + }, + { + "ce_loss": 0.04455767944455147, + "epoch": 9.072715143428953, + "step": 27200 + }, + { + "distill_loss": 0.2078428715467453, + "epoch": 9.072715143428953, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "ref_ce_loss": 0.08008597791194916, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "loss": 0.2740389406681061, + "step": 27200 + }, + { + "ce_loss": 0.026906346902251244, + "epoch": 9.072715143428953, + "step": 27200 + }, + { + "distill_loss": 0.13863641023635864, + "epoch": 9.072715143428953, + "step": 27200 + }, + { + "epoch": 9.072715143428953, + "ref_ce_loss": 0.0535602904856205, + "step": 27200 + }, + { + "epoch": 9.076050700466977, + "loss": 0.3602, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "grad_norm": 1.1693758964538574, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "learning_rate": 1.7776943565601046e-05, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "loss": 0.3499804735183716, + "step": 27210 + }, + { + "ce_loss": 0.028613731265068054, + "epoch": 9.076050700466977, + "step": 27210 + }, + { + "distill_loss": 0.1530584990978241, + "epoch": 9.076050700466977, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "ref_ce_loss": 0.07354908436536789, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "loss": 0.6023759245872498, + "step": 27210 + }, + { + "ce_loss": 0.03832937777042389, + "epoch": 9.076050700466977, + "step": 27210 + }, + { + "distill_loss": 0.1743897646665573, + "epoch": 9.076050700466977, + "step": 27210 + }, + { + "epoch": 9.076050700466977, + "ref_ce_loss": 0.055310387164354324, + "step": 27210 + }, + { + "epoch": 9.079386257505003, + "loss": 0.3774, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "grad_norm": 1.050165057182312, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "learning_rate": 1.764977253551776e-05, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "loss": 0.5067462921142578, + "step": 27220 + }, + { + "ce_loss": 0.02488376945257187, + "epoch": 9.079386257505003, + "step": 27220 + }, + { + "distill_loss": 0.15910297632217407, + "epoch": 9.079386257505003, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "ref_ce_loss": 0.06450691819190979, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "loss": 0.3871625065803528, + "step": 27220 + }, + { + "ce_loss": 0.023935766890645027, + "epoch": 9.079386257505003, + "step": 27220 + }, + { + "distill_loss": 0.14328233897686005, + "epoch": 9.079386257505003, + "step": 27220 + }, + { + "epoch": 9.079386257505003, + "ref_ce_loss": 0.0571419782936573, + "step": 27220 + }, + { + "epoch": 9.082721814543028, + "loss": 0.3639, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "grad_norm": 1.2589771747589111, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "learning_rate": 1.7523047749292433e-05, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "loss": 0.27285081148147583, + "step": 27230 + }, + { + "ce_loss": 0.022948715835809708, + "epoch": 9.082721814543028, + "step": 27230 + }, + { + "distill_loss": 0.16958534717559814, + "epoch": 9.082721814543028, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "ref_ce_loss": 0.0546344593167305, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "loss": 0.26805174350738525, + "step": 27230 + }, + { + "ce_loss": 0.03970019891858101, + "epoch": 9.082721814543028, + "step": 27230 + }, + { + "distill_loss": 0.16495995223522186, + "epoch": 9.082721814543028, + "step": 27230 + }, + { + "epoch": 9.082721814543028, + "ref_ce_loss": 0.044752802699804306, + "step": 27230 + }, + { + "epoch": 9.086057371581054, + "loss": 0.343, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "grad_norm": 0.9426337480545044, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "learning_rate": 1.7396769354826616e-05, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "loss": 0.2930073142051697, + "step": 27240 + }, + { + "ce_loss": 0.034419164061546326, + "epoch": 9.086057371581054, + "step": 27240 + }, + { + "distill_loss": 0.18961873650550842, + "epoch": 9.086057371581054, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "ref_ce_loss": 0.050885505974292755, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "loss": 0.36552560329437256, + "step": 27240 + }, + { + "ce_loss": 0.039030518382787704, + "epoch": 9.086057371581054, + "step": 27240 + }, + { + "distill_loss": 0.20410792529582977, + "epoch": 9.086057371581054, + "step": 27240 + }, + { + "epoch": 9.086057371581054, + "ref_ce_loss": 0.07064566016197205, + "step": 27240 + }, + { + "epoch": 9.089392928619079, + "loss": 0.3392, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "grad_norm": 1.2283823490142822, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "learning_rate": 1.7270937499500773e-05, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "loss": 0.377672016620636, + "step": 27250 + }, + { + "ce_loss": 0.039118725806474686, + "epoch": 9.089392928619079, + "step": 27250 + }, + { + "distill_loss": 0.15970379114151, + "epoch": 9.089392928619079, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "ref_ce_loss": 0.0498061366379261, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "loss": 0.26500624418258667, + "step": 27250 + }, + { + "ce_loss": 0.05562635511159897, + "epoch": 9.089392928619079, + "step": 27250 + }, + { + "distill_loss": 0.15516263246536255, + "epoch": 9.089392928619079, + "step": 27250 + }, + { + "epoch": 9.089392928619079, + "ref_ce_loss": 0.03663584962487221, + "step": 27250 + }, + { + "epoch": 9.092728485657105, + "loss": 0.3697, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "grad_norm": 1.1213407516479492, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "learning_rate": 1.7145552330174276e-05, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "loss": 0.2562912404537201, + "step": 27260 + }, + { + "ce_loss": 0.01841781474649906, + "epoch": 9.092728485657105, + "step": 27260 + }, + { + "distill_loss": 0.16973228752613068, + "epoch": 9.092728485657105, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "ref_ce_loss": 0.05428987368941307, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "loss": 0.3668363094329834, + "step": 27260 + }, + { + "ce_loss": 0.04437238350510597, + "epoch": 9.092728485657105, + "step": 27260 + }, + { + "distill_loss": 0.17178454995155334, + "epoch": 9.092728485657105, + "step": 27260 + }, + { + "epoch": 9.092728485657105, + "ref_ce_loss": 0.06676838546991348, + "step": 27260 + }, + { + "epoch": 9.09606404269513, + "loss": 0.4288, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "grad_norm": 1.3704133033752441, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "learning_rate": 1.7020613993184996e-05, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "loss": 1.1396571397781372, + "step": 27270 + }, + { + "ce_loss": 0.06560547649860382, + "epoch": 9.09606404269513, + "step": 27270 + }, + { + "distill_loss": 0.22419370710849762, + "epoch": 9.09606404269513, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "ref_ce_loss": 0.06717196851968765, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "loss": 0.32848432660102844, + "step": 27270 + }, + { + "ce_loss": 0.0432603545486927, + "epoch": 9.09606404269513, + "step": 27270 + }, + { + "distill_loss": 0.16509680449962616, + "epoch": 9.09606404269513, + "step": 27270 + }, + { + "epoch": 9.09606404269513, + "ref_ce_loss": 0.04808349534869194, + "step": 27270 + }, + { + "epoch": 9.099399599733156, + "loss": 0.3899, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "grad_norm": 1.3461713790893555, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "learning_rate": 1.68961226343495e-05, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "loss": 0.31953686475753784, + "step": 27280 + }, + { + "ce_loss": 0.013826312497258186, + "epoch": 9.099399599733156, + "step": 27280 + }, + { + "distill_loss": 0.17898410558700562, + "epoch": 9.099399599733156, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "ref_ce_loss": 0.054237280040979385, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "loss": 0.29063862562179565, + "step": 27280 + }, + { + "ce_loss": 0.025695666670799255, + "epoch": 9.099399599733156, + "step": 27280 + }, + { + "distill_loss": 0.16870594024658203, + "epoch": 9.099399599733156, + "step": 27280 + }, + { + "epoch": 9.099399599733156, + "ref_ce_loss": 0.06049703061580658, + "step": 27280 + }, + { + "epoch": 9.10273515677118, + "loss": 0.3537, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "grad_norm": 1.5573277473449707, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "learning_rate": 1.677207839896253e-05, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "loss": 0.29031074047088623, + "step": 27290 + }, + { + "ce_loss": 0.02433249168097973, + "epoch": 9.10273515677118, + "step": 27290 + }, + { + "distill_loss": 0.16359396278858185, + "epoch": 9.10273515677118, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "ref_ce_loss": 0.03717683628201485, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "loss": 0.3187103867530823, + "step": 27290 + }, + { + "ce_loss": 0.0356263630092144, + "epoch": 9.10273515677118, + "step": 27290 + }, + { + "distill_loss": 0.1854030191898346, + "epoch": 9.10273515677118, + "step": 27290 + }, + { + "epoch": 9.10273515677118, + "ref_ce_loss": 0.06397075951099396, + "step": 27290 + }, + { + "epoch": 9.106070713809206, + "loss": 0.3855, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "grad_norm": 2.3191182613372803, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "learning_rate": 1.6648481431797135e-05, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "loss": 0.3305771052837372, + "step": 27300 + }, + { + "ce_loss": 0.03626731410622597, + "epoch": 9.106070713809206, + "step": 27300 + }, + { + "distill_loss": 0.16633017361164093, + "epoch": 9.106070713809206, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "ref_ce_loss": 0.039277367293834686, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "loss": 0.25371691584587097, + "step": 27300 + }, + { + "ce_loss": 0.039535846561193466, + "epoch": 9.106070713809206, + "step": 27300 + }, + { + "distill_loss": 0.16822320222854614, + "epoch": 9.106070713809206, + "step": 27300 + }, + { + "epoch": 9.106070713809206, + "ref_ce_loss": 0.04572812095284462, + "step": 27300 + }, + { + "epoch": 9.10940627084723, + "loss": 0.3495, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "grad_norm": 0.8396355509757996, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "learning_rate": 1.652533187710419e-05, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "loss": 0.3427233397960663, + "step": 27310 + }, + { + "ce_loss": 0.028757216408848763, + "epoch": 9.10940627084723, + "step": 27310 + }, + { + "distill_loss": 0.1467723846435547, + "epoch": 9.10940627084723, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "ref_ce_loss": 0.07151523232460022, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "loss": 0.41144073009490967, + "step": 27310 + }, + { + "ce_loss": 0.049031734466552734, + "epoch": 9.10940627084723, + "step": 27310 + }, + { + "distill_loss": 0.20296907424926758, + "epoch": 9.10940627084723, + "step": 27310 + }, + { + "epoch": 9.10940627084723, + "ref_ce_loss": 0.06365089118480682, + "step": 27310 + }, + { + "epoch": 9.112741827885257, + "loss": 0.3414, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "grad_norm": 1.8548704385757446, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "learning_rate": 1.6402629878612586e-05, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "loss": 0.33567383885383606, + "step": 27320 + }, + { + "ce_loss": 0.03355925902724266, + "epoch": 9.112741827885257, + "step": 27320 + }, + { + "distill_loss": 0.1899872124195099, + "epoch": 9.112741827885257, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "ref_ce_loss": 0.06817373633384705, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "loss": 0.3683999180793762, + "step": 27320 + }, + { + "ce_loss": 0.0811447873711586, + "epoch": 9.112741827885257, + "step": 27320 + }, + { + "distill_loss": 0.1963319629430771, + "epoch": 9.112741827885257, + "step": 27320 + }, + { + "epoch": 9.112741827885257, + "ref_ce_loss": 0.07135666906833649, + "step": 27320 + }, + { + "epoch": 9.116077384923281, + "loss": 0.3904, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "grad_norm": 0.9724132418632507, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "learning_rate": 1.6280375579528663e-05, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "loss": 0.29246678948402405, + "step": 27330 + }, + { + "ce_loss": 0.035326674580574036, + "epoch": 9.116077384923281, + "step": 27330 + }, + { + "distill_loss": 0.1687965989112854, + "epoch": 9.116077384923281, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "ref_ce_loss": 0.05702679604291916, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "loss": 0.39776480197906494, + "step": 27330 + }, + { + "ce_loss": 0.053400568664073944, + "epoch": 9.116077384923281, + "step": 27330 + }, + { + "distill_loss": 0.20380781590938568, + "epoch": 9.116077384923281, + "step": 27330 + }, + { + "epoch": 9.116077384923281, + "ref_ce_loss": 0.046494729816913605, + "step": 27330 + }, + { + "epoch": 9.119412941961308, + "loss": 0.3396, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "grad_norm": 1.214045763015747, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "learning_rate": 1.6158569122536414e-05, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "loss": 0.37317508459091187, + "step": 27340 + }, + { + "ce_loss": 0.0343099981546402, + "epoch": 9.119412941961308, + "step": 27340 + }, + { + "distill_loss": 0.16131190955638885, + "epoch": 9.119412941961308, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "ref_ce_loss": 0.042967405170202255, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "loss": 0.3499917685985565, + "step": 27340 + }, + { + "ce_loss": 0.04128754511475563, + "epoch": 9.119412941961308, + "step": 27340 + }, + { + "distill_loss": 0.1890879124403, + "epoch": 9.119412941961308, + "step": 27340 + }, + { + "epoch": 9.119412941961308, + "ref_ce_loss": 0.0573907345533371, + "step": 27340 + }, + { + "epoch": 9.122748498999332, + "loss": 0.3856, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "grad_norm": 1.5197536945343018, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "learning_rate": 1.6037210649797063e-05, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "loss": 0.3776387572288513, + "step": 27350 + }, + { + "ce_loss": 0.08085337281227112, + "epoch": 9.122748498999332, + "step": 27350 + }, + { + "distill_loss": 0.17287683486938477, + "epoch": 9.122748498999332, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "ref_ce_loss": 0.06616152077913284, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "loss": 0.373631089925766, + "step": 27350 + }, + { + "ce_loss": 0.030946599319577217, + "epoch": 9.122748498999332, + "step": 27350 + }, + { + "distill_loss": 0.19271759688854218, + "epoch": 9.122748498999332, + "step": 27350 + }, + { + "epoch": 9.122748498999332, + "ref_ce_loss": 0.07093948125839233, + "step": 27350 + }, + { + "epoch": 9.126084056037358, + "loss": 0.3804, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "grad_norm": 1.1146163940429688, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "learning_rate": 1.5916300302948905e-05, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "loss": 0.2942475378513336, + "step": 27360 + }, + { + "ce_loss": 0.029155785217881203, + "epoch": 9.126084056037358, + "step": 27360 + }, + { + "distill_loss": 0.17508697509765625, + "epoch": 9.126084056037358, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "ref_ce_loss": 0.0685693621635437, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "loss": 0.3584308624267578, + "step": 27360 + }, + { + "ce_loss": 0.02746543474495411, + "epoch": 9.126084056037358, + "step": 27360 + }, + { + "distill_loss": 0.19145989418029785, + "epoch": 9.126084056037358, + "step": 27360 + }, + { + "epoch": 9.126084056037358, + "ref_ce_loss": 0.06303809583187103, + "step": 27360 + }, + { + "epoch": 9.129419613075383, + "loss": 0.3486, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "grad_norm": 0.8257373571395874, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "learning_rate": 1.579583822310746e-05, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "loss": 0.27437683939933777, + "step": 27370 + }, + { + "ce_loss": 0.018562039360404015, + "epoch": 9.129419613075383, + "step": 27370 + }, + { + "distill_loss": 0.14559762179851532, + "epoch": 9.129419613075383, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "ref_ce_loss": 0.04495834559202194, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "loss": 0.36754468083381653, + "step": 27370 + }, + { + "ce_loss": 0.05180488899350166, + "epoch": 9.129419613075383, + "step": 27370 + }, + { + "distill_loss": 0.21629267930984497, + "epoch": 9.129419613075383, + "step": 27370 + }, + { + "epoch": 9.129419613075383, + "ref_ce_loss": 0.07912370562553406, + "step": 27370 + }, + { + "epoch": 9.132755170113409, + "loss": 0.3541, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "grad_norm": 1.5731775760650635, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "learning_rate": 1.567582455086494e-05, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "loss": 0.3551574945449829, + "step": 27380 + }, + { + "ce_loss": 0.03758762776851654, + "epoch": 9.132755170113409, + "step": 27380 + }, + { + "distill_loss": 0.15888771414756775, + "epoch": 9.132755170113409, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "ref_ce_loss": 0.06447268277406693, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "loss": 0.3179193139076233, + "step": 27380 + }, + { + "ce_loss": 0.048470109701156616, + "epoch": 9.132755170113409, + "step": 27380 + }, + { + "distill_loss": 0.15303857624530792, + "epoch": 9.132755170113409, + "step": 27380 + }, + { + "epoch": 9.132755170113409, + "ref_ce_loss": 0.05130033940076828, + "step": 27380 + }, + { + "epoch": 9.136090727151434, + "loss": 0.3701, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "grad_norm": 1.0357892513275146, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "learning_rate": 1.5556259426290086e-05, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "loss": 0.29864880442619324, + "step": 27390 + }, + { + "ce_loss": 0.05004766955971718, + "epoch": 9.136090727151434, + "step": 27390 + }, + { + "distill_loss": 0.19834557175636292, + "epoch": 9.136090727151434, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "ref_ce_loss": 0.04956059902906418, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "loss": 0.24494145810604095, + "step": 27390 + }, + { + "ce_loss": 0.03270229697227478, + "epoch": 9.136090727151434, + "step": 27390 + }, + { + "distill_loss": 0.1631958782672882, + "epoch": 9.136090727151434, + "step": 27390 + }, + { + "epoch": 9.136090727151434, + "ref_ce_loss": 0.04877462610602379, + "step": 27390 + }, + { + "epoch": 9.13942628418946, + "loss": 0.3564, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "grad_norm": 0.8601231575012207, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "learning_rate": 1.543714298892831e-05, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "loss": 0.2472851574420929, + "step": 27400 + }, + { + "ce_loss": 0.02401360310614109, + "epoch": 9.13942628418946, + "step": 27400 + }, + { + "distill_loss": 0.14570125937461853, + "epoch": 9.13942628418946, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "ref_ce_loss": 0.05537709966301918, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "loss": 0.3812285363674164, + "step": 27400 + }, + { + "ce_loss": 0.06255284696817398, + "epoch": 9.13942628418946, + "step": 27400 + }, + { + "distill_loss": 0.2078147828578949, + "epoch": 9.13942628418946, + "step": 27400 + }, + { + "epoch": 9.13942628418946, + "ref_ce_loss": 0.0539514385163784, + "step": 27400 + }, + { + "epoch": 9.142761841227484, + "loss": 0.3798, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "grad_norm": 1.2774498462677002, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "learning_rate": 1.5318475377801422e-05, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "loss": 0.3671090304851532, + "step": 27410 + }, + { + "ce_loss": 0.06406539678573608, + "epoch": 9.142761841227484, + "step": 27410 + }, + { + "distill_loss": 0.1353074312210083, + "epoch": 9.142761841227484, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "ref_ce_loss": 0.07601393014192581, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "loss": 0.2966614365577698, + "step": 27410 + }, + { + "ce_loss": 0.03425230830907822, + "epoch": 9.142761841227484, + "step": 27410 + }, + { + "distill_loss": 0.16910341382026672, + "epoch": 9.142761841227484, + "step": 27410 + }, + { + "epoch": 9.142761841227484, + "ref_ce_loss": 0.06891260296106339, + "step": 27410 + }, + { + "epoch": 9.14609739826551, + "loss": 0.3266, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "grad_norm": 1.0859674215316772, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "learning_rate": 1.5200256731407214e-05, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "loss": 0.42090123891830444, + "step": 27420 + }, + { + "ce_loss": 0.02037889137864113, + "epoch": 9.14609739826551, + "step": 27420 + }, + { + "distill_loss": 0.15968488156795502, + "epoch": 9.14609739826551, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "ref_ce_loss": 0.0486813485622406, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "loss": 0.3073991537094116, + "step": 27420 + }, + { + "ce_loss": 0.028052043169736862, + "epoch": 9.14609739826551, + "step": 27420 + }, + { + "distill_loss": 0.18357442319393158, + "epoch": 9.14609739826551, + "step": 27420 + }, + { + "epoch": 9.14609739826551, + "ref_ce_loss": 0.06518913060426712, + "step": 27420 + }, + { + "epoch": 9.149432955303535, + "loss": 0.3282, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "grad_norm": 1.4294742345809937, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "learning_rate": 1.5082487187719495e-05, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "loss": 0.2739667594432831, + "step": 27430 + }, + { + "ce_loss": 0.04306583106517792, + "epoch": 9.149432955303535, + "step": 27430 + }, + { + "distill_loss": 0.16011951863765717, + "epoch": 9.149432955303535, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "ref_ce_loss": 0.070659339427948, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "loss": 1.037210464477539, + "step": 27430 + }, + { + "ce_loss": 0.06925816833972931, + "epoch": 9.149432955303535, + "step": 27430 + }, + { + "distill_loss": 0.1676364243030548, + "epoch": 9.149432955303535, + "step": 27430 + }, + { + "epoch": 9.149432955303535, + "ref_ce_loss": 0.06458926945924759, + "step": 27430 + }, + { + "epoch": 9.152768512341561, + "loss": 0.3814, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "grad_norm": 0.8522936105728149, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "learning_rate": 1.4965166884188097e-05, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "loss": 0.29712921380996704, + "step": 27440 + }, + { + "ce_loss": 0.05974900349974632, + "epoch": 9.152768512341561, + "step": 27440 + }, + { + "distill_loss": 0.17028288543224335, + "epoch": 9.152768512341561, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "ref_ce_loss": 0.06694556027650833, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "loss": 0.33882614970207214, + "step": 27440 + }, + { + "ce_loss": 0.0511908158659935, + "epoch": 9.152768512341561, + "step": 27440 + }, + { + "distill_loss": 0.194253608584404, + "epoch": 9.152768512341561, + "step": 27440 + }, + { + "epoch": 9.152768512341561, + "ref_ce_loss": 0.06548138707876205, + "step": 27440 + }, + { + "epoch": 9.156104069379586, + "loss": 0.3127, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "grad_norm": 0.9218608140945435, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "learning_rate": 1.4848295957738467e-05, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "loss": 0.35625752806663513, + "step": 27450 + }, + { + "ce_loss": 0.06627841293811798, + "epoch": 9.156104069379586, + "step": 27450 + }, + { + "distill_loss": 0.18910615146160126, + "epoch": 9.156104069379586, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "ref_ce_loss": 0.07055387645959854, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "loss": 0.5226238369941711, + "step": 27450 + }, + { + "ce_loss": 0.05875205248594284, + "epoch": 9.156104069379586, + "step": 27450 + }, + { + "distill_loss": 0.19257190823554993, + "epoch": 9.156104069379586, + "step": 27450 + }, + { + "epoch": 9.156104069379586, + "ref_ce_loss": 0.07424885034561157, + "step": 27450 + }, + { + "epoch": 9.159439626417612, + "loss": 0.3483, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "grad_norm": 1.1902096271514893, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "learning_rate": 1.4731874544771452e-05, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "loss": 0.31832242012023926, + "step": 27460 + }, + { + "ce_loss": 0.03065738081932068, + "epoch": 9.159439626417612, + "step": 27460 + }, + { + "distill_loss": 0.1613398790359497, + "epoch": 9.159439626417612, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "ref_ce_loss": 0.05969864875078201, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "loss": 0.3331111967563629, + "step": 27460 + }, + { + "ce_loss": 0.0514845997095108, + "epoch": 9.159439626417612, + "step": 27460 + }, + { + "distill_loss": 0.19371838867664337, + "epoch": 9.159439626417612, + "step": 27460 + }, + { + "epoch": 9.159439626417612, + "ref_ce_loss": 0.06049351766705513, + "step": 27460 + }, + { + "epoch": 9.162775183455636, + "loss": 0.3752, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "grad_norm": 4.025130748748779, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "learning_rate": 1.4615902781163382e-05, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "loss": 0.3633132874965668, + "step": 27470 + }, + { + "ce_loss": 0.03981828689575195, + "epoch": 9.162775183455636, + "step": 27470 + }, + { + "distill_loss": 0.14197511970996857, + "epoch": 9.162775183455636, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "ref_ce_loss": 0.0731300413608551, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "loss": 0.31256744265556335, + "step": 27470 + }, + { + "ce_loss": 0.0272130835801363, + "epoch": 9.162775183455636, + "step": 27470 + }, + { + "distill_loss": 0.18780873715877533, + "epoch": 9.162775183455636, + "step": 27470 + }, + { + "epoch": 9.162775183455636, + "ref_ce_loss": 0.06526809185743332, + "step": 27470 + }, + { + "epoch": 9.166110740493663, + "loss": 0.3465, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "grad_norm": 0.8970112204551697, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "learning_rate": 1.4500380802265856e-05, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "loss": 0.3625814616680145, + "step": 27480 + }, + { + "ce_loss": 0.018771648406982422, + "epoch": 9.166110740493663, + "step": 27480 + }, + { + "distill_loss": 0.16558365523815155, + "epoch": 9.166110740493663, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "ref_ce_loss": 0.0564962700009346, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "loss": 0.31672704219818115, + "step": 27480 + }, + { + "ce_loss": 0.030506962910294533, + "epoch": 9.166110740493663, + "step": 27480 + }, + { + "distill_loss": 0.1810687929391861, + "epoch": 9.166110740493663, + "step": 27480 + }, + { + "epoch": 9.166110740493663, + "ref_ce_loss": 0.07672914117574692, + "step": 27480 + }, + { + "epoch": 9.169446297531687, + "loss": 0.3435, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "grad_norm": 0.8674625158309937, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "learning_rate": 1.4385308742905423e-05, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "loss": 0.3576408922672272, + "step": 27490 + }, + { + "ce_loss": 0.03308350220322609, + "epoch": 9.169446297531687, + "step": 27490 + }, + { + "distill_loss": 0.18558846414089203, + "epoch": 9.169446297531687, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "ref_ce_loss": 0.07912314683198929, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "loss": 0.41283512115478516, + "step": 27490 + }, + { + "ce_loss": 0.0438760407269001, + "epoch": 9.169446297531687, + "step": 27490 + }, + { + "distill_loss": 0.19519221782684326, + "epoch": 9.169446297531687, + "step": 27490 + }, + { + "epoch": 9.169446297531687, + "ref_ce_loss": 0.07502762228250504, + "step": 27490 + }, + { + "epoch": 9.172781854569713, + "loss": 0.3492, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "grad_norm": 1.755750060081482, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "learning_rate": 1.42706867373835e-05, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "loss": 0.3151405155658722, + "step": 27500 + }, + { + "ce_loss": 0.05989915877580643, + "epoch": 9.172781854569713, + "step": 27500 + }, + { + "distill_loss": 0.16233427822589874, + "epoch": 9.172781854569713, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "ref_ce_loss": 0.0649150013923645, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "loss": 0.5010256767272949, + "step": 27500 + }, + { + "ce_loss": 0.060962971299886703, + "epoch": 9.172781854569713, + "step": 27500 + }, + { + "distill_loss": 0.2040940821170807, + "epoch": 9.172781854569713, + "step": 27500 + }, + { + "epoch": 9.172781854569713, + "ref_ce_loss": 0.04863875359296799, + "step": 27500 + }, + { + "epoch": 9.176117411607738, + "loss": 0.3409, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "grad_norm": 1.3543940782546997, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "learning_rate": 1.4156514919476271e-05, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "loss": 0.3677135407924652, + "step": 27510 + }, + { + "ce_loss": 0.04805472865700722, + "epoch": 9.176117411607738, + "step": 27510 + }, + { + "distill_loss": 0.20692524313926697, + "epoch": 9.176117411607738, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "ref_ce_loss": 0.055242788046598434, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "loss": 0.44220152497291565, + "step": 27510 + }, + { + "ce_loss": 0.04361943155527115, + "epoch": 9.176117411607738, + "step": 27510 + }, + { + "distill_loss": 0.193200945854187, + "epoch": 9.176117411607738, + "step": 27510 + }, + { + "epoch": 9.176117411607738, + "ref_ce_loss": 0.055958181619644165, + "step": 27510 + }, + { + "epoch": 9.179452968645764, + "loss": 0.3342, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "grad_norm": 1.265563726425171, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "learning_rate": 1.4042793422434707e-05, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "loss": 0.44008785486221313, + "step": 27520 + }, + { + "ce_loss": 0.02725176140666008, + "epoch": 9.179452968645764, + "step": 27520 + }, + { + "distill_loss": 0.1367584764957428, + "epoch": 9.179452968645764, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "ref_ce_loss": 0.0530785396695137, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "loss": 0.36948102712631226, + "step": 27520 + }, + { + "ce_loss": 0.048951759934425354, + "epoch": 9.179452968645764, + "step": 27520 + }, + { + "distill_loss": 0.20727337896823883, + "epoch": 9.179452968645764, + "step": 27520 + }, + { + "epoch": 9.179452968645764, + "ref_ce_loss": 0.05927341803908348, + "step": 27520 + }, + { + "epoch": 9.182788525683788, + "loss": 0.3524, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "grad_norm": 1.4308565855026245, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "learning_rate": 1.3929522378983928e-05, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "loss": 0.41626620292663574, + "step": 27530 + }, + { + "ce_loss": 0.03843936324119568, + "epoch": 9.182788525683788, + "step": 27530 + }, + { + "distill_loss": 0.18540190160274506, + "epoch": 9.182788525683788, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "ref_ce_loss": 0.061426643282175064, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "loss": 0.2769725024700165, + "step": 27530 + }, + { + "ce_loss": 0.03272394835948944, + "epoch": 9.182788525683788, + "step": 27530 + }, + { + "distill_loss": 0.1454697698354721, + "epoch": 9.182788525683788, + "step": 27530 + }, + { + "epoch": 9.182788525683788, + "ref_ce_loss": 0.06404945999383926, + "step": 27530 + }, + { + "epoch": 9.186124082721815, + "loss": 0.368, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "grad_norm": 1.2058454751968384, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "learning_rate": 1.3816701921323428e-05, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "loss": 0.43945053219795227, + "step": 27540 + }, + { + "ce_loss": 0.050008222460746765, + "epoch": 9.186124082721815, + "step": 27540 + }, + { + "distill_loss": 0.1664680391550064, + "epoch": 9.186124082721815, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "ref_ce_loss": 0.0641934797167778, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "loss": 0.3098108172416687, + "step": 27540 + }, + { + "ce_loss": 0.05395625904202461, + "epoch": 9.186124082721815, + "step": 27540 + }, + { + "distill_loss": 0.14219501614570618, + "epoch": 9.186124082721815, + "step": 27540 + }, + { + "epoch": 9.186124082721815, + "ref_ce_loss": 0.06553469598293304, + "step": 27540 + }, + { + "epoch": 9.18945963975984, + "loss": 0.3232, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "grad_norm": 1.7351081371307373, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "learning_rate": 1.3704332181126811e-05, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "loss": 0.3305128216743469, + "step": 27550 + }, + { + "ce_loss": 0.052353035658597946, + "epoch": 9.18945963975984, + "step": 27550 + }, + { + "distill_loss": 0.19812655448913574, + "epoch": 9.18945963975984, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "ref_ce_loss": 0.05271931365132332, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "loss": 0.36235523223876953, + "step": 27550 + }, + { + "ce_loss": 0.07153531163930893, + "epoch": 9.18945963975984, + "step": 27550 + }, + { + "distill_loss": 0.21841926872730255, + "epoch": 9.18945963975984, + "step": 27550 + }, + { + "epoch": 9.18945963975984, + "ref_ce_loss": 0.06689158082008362, + "step": 27550 + }, + { + "epoch": 9.192795196797865, + "loss": 0.3532, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "grad_norm": 1.0499604940414429, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "learning_rate": 1.3592413289541661e-05, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "loss": 0.4369659125804901, + "step": 27560 + }, + { + "ce_loss": 0.06519733369350433, + "epoch": 9.192795196797865, + "step": 27560 + }, + { + "distill_loss": 0.23988565802574158, + "epoch": 9.192795196797865, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "ref_ce_loss": 0.09593500941991806, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "loss": 0.2988507151603699, + "step": 27560 + }, + { + "ce_loss": 0.03475344553589821, + "epoch": 9.192795196797865, + "step": 27560 + }, + { + "distill_loss": 0.17159003019332886, + "epoch": 9.192795196797865, + "step": 27560 + }, + { + "epoch": 9.192795196797865, + "ref_ce_loss": 0.06151943281292915, + "step": 27560 + }, + { + "epoch": 9.19613075383589, + "loss": 0.3207, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "grad_norm": 1.0811963081359863, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "learning_rate": 1.3480945377189446e-05, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "loss": 0.4045826196670532, + "step": 27570 + }, + { + "ce_loss": 0.031967297196388245, + "epoch": 9.19613075383589, + "step": 27570 + }, + { + "distill_loss": 0.1706383228302002, + "epoch": 9.19613075383589, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "ref_ce_loss": 0.05827203392982483, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "loss": 0.2599678635597229, + "step": 27570 + }, + { + "ce_loss": 0.038392551243305206, + "epoch": 9.19613075383589, + "step": 27570 + }, + { + "distill_loss": 0.12934038043022156, + "epoch": 9.19613075383589, + "step": 27570 + }, + { + "epoch": 9.19613075383589, + "ref_ce_loss": 0.06656457483768463, + "step": 27570 + }, + { + "epoch": 9.199466310873916, + "loss": 0.3678, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "grad_norm": 1.811726450920105, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "learning_rate": 1.3369928574165124e-05, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "loss": 0.5186915993690491, + "step": 27580 + }, + { + "ce_loss": 0.0656609907746315, + "epoch": 9.199466310873916, + "step": 27580 + }, + { + "distill_loss": 0.22513791918754578, + "epoch": 9.199466310873916, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "ref_ce_loss": 0.07715293765068054, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "loss": 0.2685270309448242, + "step": 27580 + }, + { + "ce_loss": 0.030084757134318352, + "epoch": 9.199466310873916, + "step": 27580 + }, + { + "distill_loss": 0.16327223181724548, + "epoch": 9.199466310873916, + "step": 27580 + }, + { + "epoch": 9.199466310873916, + "ref_ce_loss": 0.0434606596827507, + "step": 27580 + }, + { + "epoch": 9.20280186791194, + "loss": 0.3612, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "grad_norm": 1.1356041431427002, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "learning_rate": 1.325936301003723e-05, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "loss": 0.33782315254211426, + "step": 27590 + }, + { + "ce_loss": 0.03354613110423088, + "epoch": 9.20280186791194, + "step": 27590 + }, + { + "distill_loss": 0.16796359419822693, + "epoch": 9.20280186791194, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "ref_ce_loss": 0.05903716757893562, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "loss": 0.259998619556427, + "step": 27590 + }, + { + "ce_loss": 0.033992376178503036, + "epoch": 9.20280186791194, + "step": 27590 + }, + { + "distill_loss": 0.18147948384284973, + "epoch": 9.20280186791194, + "step": 27590 + }, + { + "epoch": 9.20280186791194, + "ref_ce_loss": 0.044291913509368896, + "step": 27590 + }, + { + "epoch": 9.206137424949967, + "loss": 0.3284, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "grad_norm": 1.02849280834198, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "learning_rate": 1.3149248813847737e-05, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "loss": 0.29262685775756836, + "step": 27600 + }, + { + "ce_loss": 0.045261502265930176, + "epoch": 9.206137424949967, + "step": 27600 + }, + { + "distill_loss": 0.154446542263031, + "epoch": 9.206137424949967, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "ref_ce_loss": 0.060710638761520386, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "loss": 0.42080560326576233, + "step": 27600 + }, + { + "ce_loss": 0.0515153594315052, + "epoch": 9.206137424949967, + "step": 27600 + }, + { + "distill_loss": 0.22277718782424927, + "epoch": 9.206137424949967, + "step": 27600 + }, + { + "epoch": 9.206137424949967, + "ref_ce_loss": 0.05690597742795944, + "step": 27600 + }, + { + "epoch": 9.209472981987991, + "loss": 0.3054, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "grad_norm": 1.1911672353744507, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "learning_rate": 1.30395861141118e-05, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "loss": 0.3175136148929596, + "step": 27610 + }, + { + "ce_loss": 0.036621760576963425, + "epoch": 9.209472981987991, + "step": 27610 + }, + { + "distill_loss": 0.1766699254512787, + "epoch": 9.209472981987991, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "ref_ce_loss": 0.06541144847869873, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "loss": 0.28621190786361694, + "step": 27610 + }, + { + "ce_loss": 0.04701051861047745, + "epoch": 9.209472981987991, + "step": 27610 + }, + { + "distill_loss": 0.16483399271965027, + "epoch": 9.209472981987991, + "step": 27610 + }, + { + "epoch": 9.209472981987991, + "ref_ce_loss": 0.05313900113105774, + "step": 27610 + }, + { + "epoch": 9.212808539026017, + "loss": 0.3519, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "grad_norm": 1.150766372680664, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "learning_rate": 1.29303750388174e-05, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "loss": 0.39144226908683777, + "step": 27620 + }, + { + "ce_loss": 0.03537900000810623, + "epoch": 9.212808539026017, + "step": 27620 + }, + { + "distill_loss": 0.15409021079540253, + "epoch": 9.212808539026017, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "ref_ce_loss": 0.058260228484869, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "loss": 0.5103629231452942, + "step": 27620 + }, + { + "ce_loss": 0.03485684096813202, + "epoch": 9.212808539026017, + "step": 27620 + }, + { + "distill_loss": 0.2051319181919098, + "epoch": 9.212808539026017, + "step": 27620 + }, + { + "epoch": 9.212808539026017, + "ref_ce_loss": 0.05551000311970711, + "step": 27620 + }, + { + "epoch": 9.216144096064042, + "loss": 0.3647, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "grad_norm": 1.2545127868652344, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "learning_rate": 1.2821615715425817e-05, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "loss": 0.21375618875026703, + "step": 27630 + }, + { + "ce_loss": 0.02147822454571724, + "epoch": 9.216144096064042, + "step": 27630 + }, + { + "distill_loss": 0.13803425431251526, + "epoch": 9.216144096064042, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "ref_ce_loss": 0.053897660225629807, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "loss": 0.3832044005393982, + "step": 27630 + }, + { + "ce_loss": 0.07676784694194794, + "epoch": 9.216144096064042, + "step": 27630 + }, + { + "distill_loss": 0.18233522772789001, + "epoch": 9.216144096064042, + "step": 27630 + }, + { + "epoch": 9.216144096064042, + "ref_ce_loss": 0.06758108735084534, + "step": 27630 + }, + { + "epoch": 9.219479653102068, + "loss": 0.3424, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "grad_norm": 0.9458558559417725, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "learning_rate": 1.271330827087085e-05, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "loss": 0.2649664878845215, + "step": 27640 + }, + { + "ce_loss": 0.041778381913900375, + "epoch": 9.219479653102068, + "step": 27640 + }, + { + "distill_loss": 0.17980529367923737, + "epoch": 9.219479653102068, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "ref_ce_loss": 0.04321242496371269, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "loss": 0.40541356801986694, + "step": 27640 + }, + { + "ce_loss": 0.03248671814799309, + "epoch": 9.219479653102068, + "step": 27640 + }, + { + "distill_loss": 0.16780821979045868, + "epoch": 9.219479653102068, + "step": 27640 + }, + { + "epoch": 9.219479653102068, + "ref_ce_loss": 0.038705579936504364, + "step": 27640 + }, + { + "epoch": 9.222815210140093, + "loss": 0.3235, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "grad_norm": 2.606320858001709, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "learning_rate": 1.2605452831558896e-05, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "loss": 0.4550734758377075, + "step": 27650 + }, + { + "ce_loss": 0.09363550692796707, + "epoch": 9.222815210140093, + "step": 27650 + }, + { + "distill_loss": 0.20949184894561768, + "epoch": 9.222815210140093, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "ref_ce_loss": 0.07188888639211655, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "loss": 0.3436967432498932, + "step": 27650 + }, + { + "ce_loss": 0.04648522660136223, + "epoch": 9.222815210140093, + "step": 27650 + }, + { + "distill_loss": 0.15692740678787231, + "epoch": 9.222815210140093, + "step": 27650 + }, + { + "epoch": 9.222815210140093, + "ref_ce_loss": 0.06392422318458557, + "step": 27650 + }, + { + "epoch": 9.226150767178119, + "loss": 0.3399, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "grad_norm": 1.5653630495071411, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "learning_rate": 1.2498049523368816e-05, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "loss": 0.3628976345062256, + "step": 27660 + }, + { + "ce_loss": 0.06816162168979645, + "epoch": 9.226150767178119, + "step": 27660 + }, + { + "distill_loss": 0.1749858856201172, + "epoch": 9.226150767178119, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "ref_ce_loss": 0.06702398508787155, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "loss": 0.2408471554517746, + "step": 27660 + }, + { + "ce_loss": 0.03688124194741249, + "epoch": 9.226150767178119, + "step": 27660 + }, + { + "distill_loss": 0.1462167203426361, + "epoch": 9.226150767178119, + "step": 27660 + }, + { + "epoch": 9.226150767178119, + "ref_ce_loss": 0.0576145276427269, + "step": 27660 + }, + { + "epoch": 9.229486324216143, + "loss": 0.3421, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "grad_norm": 1.1789419651031494, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "learning_rate": 1.2391098471651896e-05, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "loss": 0.33586499094963074, + "step": 27670 + }, + { + "ce_loss": 0.05634840950369835, + "epoch": 9.229486324216143, + "step": 27670 + }, + { + "distill_loss": 0.1920376718044281, + "epoch": 9.229486324216143, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "ref_ce_loss": 0.06841642409563065, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "loss": 0.4046776592731476, + "step": 27670 + }, + { + "ce_loss": 0.0436110757291317, + "epoch": 9.229486324216143, + "step": 27670 + }, + { + "distill_loss": 0.17451398074626923, + "epoch": 9.229486324216143, + "step": 27670 + }, + { + "epoch": 9.229486324216143, + "ref_ce_loss": 0.06450141966342926, + "step": 27670 + }, + { + "epoch": 9.23282188125417, + "loss": 0.327, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "grad_norm": 1.0851166248321533, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "learning_rate": 1.2284599801231489e-05, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "loss": 0.2801929712295532, + "step": 27680 + }, + { + "ce_loss": 0.03620101511478424, + "epoch": 9.23282188125417, + "step": 27680 + }, + { + "distill_loss": 0.18607118725776672, + "epoch": 9.23282188125417, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "ref_ce_loss": 0.04491456225514412, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "loss": 0.38286706805229187, + "step": 27680 + }, + { + "ce_loss": 0.0755334123969078, + "epoch": 9.23282188125417, + "step": 27680 + }, + { + "distill_loss": 0.2033005952835083, + "epoch": 9.23282188125417, + "step": 27680 + }, + { + "epoch": 9.23282188125417, + "ref_ce_loss": 0.07997366786003113, + "step": 27680 + }, + { + "epoch": 9.236157438292194, + "loss": 0.3318, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "grad_norm": 0.9657601714134216, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "learning_rate": 1.2178553636403101e-05, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "loss": 0.8263682126998901, + "step": 27690 + }, + { + "ce_loss": 0.07217461615800858, + "epoch": 9.236157438292194, + "step": 27690 + }, + { + "distill_loss": 0.17183834314346313, + "epoch": 9.236157438292194, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "ref_ce_loss": 0.06462493538856506, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "loss": 0.3429591655731201, + "step": 27690 + }, + { + "ce_loss": 0.04136383906006813, + "epoch": 9.236157438292194, + "step": 27690 + }, + { + "distill_loss": 0.18173544108867645, + "epoch": 9.236157438292194, + "step": 27690 + }, + { + "epoch": 9.236157438292194, + "ref_ce_loss": 0.05896923691034317, + "step": 27690 + }, + { + "epoch": 9.23949299533022, + "loss": 0.3641, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "grad_norm": 1.0028430223464966, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "learning_rate": 1.207296010093386e-05, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "loss": 0.37061363458633423, + "step": 27700 + }, + { + "ce_loss": 0.030712340027093887, + "epoch": 9.23949299533022, + "step": 27700 + }, + { + "distill_loss": 0.1883632242679596, + "epoch": 9.23949299533022, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "ref_ce_loss": 0.07665921002626419, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "loss": 0.3146851658821106, + "step": 27700 + }, + { + "ce_loss": 0.055054206401109695, + "epoch": 9.23949299533022, + "step": 27700 + }, + { + "distill_loss": 0.17789779603481293, + "epoch": 9.23949299533022, + "step": 27700 + }, + { + "epoch": 9.23949299533022, + "ref_ce_loss": 0.059667814522981644, + "step": 27700 + }, + { + "epoch": 9.242828552368245, + "loss": 0.3384, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "grad_norm": 1.1923667192459106, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "learning_rate": 1.1967819318062835e-05, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "loss": 0.2766267955303192, + "step": 27710 + }, + { + "ce_loss": 0.05385156348347664, + "epoch": 9.242828552368245, + "step": 27710 + }, + { + "distill_loss": 0.14416009187698364, + "epoch": 9.242828552368245, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "ref_ce_loss": 0.0671229436993599, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "loss": 0.3347581923007965, + "step": 27710 + }, + { + "ce_loss": 0.05921385437250137, + "epoch": 9.242828552368245, + "step": 27710 + }, + { + "distill_loss": 0.18549509346485138, + "epoch": 9.242828552368245, + "step": 27710 + }, + { + "epoch": 9.242828552368245, + "ref_ce_loss": 0.0644620805978775, + "step": 27710 + }, + { + "epoch": 9.246164109406271, + "loss": 0.3363, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "grad_norm": 1.4752428531646729, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "learning_rate": 1.1863131410500706e-05, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "loss": 0.45307832956314087, + "step": 27720 + }, + { + "ce_loss": 0.06924276053905487, + "epoch": 9.246164109406271, + "step": 27720 + }, + { + "distill_loss": 0.18389859795570374, + "epoch": 9.246164109406271, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "ref_ce_loss": 0.06640004366636276, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "loss": 0.3857943117618561, + "step": 27720 + }, + { + "ce_loss": 0.05119406431913376, + "epoch": 9.246164109406271, + "step": 27720 + }, + { + "distill_loss": 0.21899224817752838, + "epoch": 9.246164109406271, + "step": 27720 + }, + { + "epoch": 9.246164109406271, + "ref_ce_loss": 0.06401392817497253, + "step": 27720 + }, + { + "epoch": 9.249499666444295, + "loss": 0.3581, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "grad_norm": 0.9835862517356873, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "learning_rate": 1.1758896500429428e-05, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "loss": 0.29481619596481323, + "step": 27730 + }, + { + "ce_loss": 0.050965506583452225, + "epoch": 9.249499666444295, + "step": 27730 + }, + { + "distill_loss": 0.15273383259773254, + "epoch": 9.249499666444295, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "ref_ce_loss": 0.0674981027841568, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "loss": 0.3622666895389557, + "step": 27730 + }, + { + "ce_loss": 0.034598208963871, + "epoch": 9.249499666444295, + "step": 27730 + }, + { + "distill_loss": 0.18676534295082092, + "epoch": 9.249499666444295, + "step": 27730 + }, + { + "epoch": 9.249499666444295, + "ref_ce_loss": 0.06467654556035995, + "step": 27730 + }, + { + "epoch": 9.252835223482322, + "loss": 0.3628, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "grad_norm": 0.8673005104064941, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "learning_rate": 1.1655114709502447e-05, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "loss": 0.29098695516586304, + "step": 27740 + }, + { + "ce_loss": 0.02971721440553665, + "epoch": 9.252835223482322, + "step": 27740 + }, + { + "distill_loss": 0.15608163177967072, + "epoch": 9.252835223482322, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "ref_ce_loss": 0.052347734570503235, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "loss": 0.24176278710365295, + "step": 27740 + }, + { + "ce_loss": 0.024755051359534264, + "epoch": 9.252835223482322, + "step": 27740 + }, + { + "distill_loss": 0.12914715707302094, + "epoch": 9.252835223482322, + "step": 27740 + }, + { + "epoch": 9.252835223482322, + "ref_ce_loss": 0.057348188012838364, + "step": 27740 + }, + { + "epoch": 9.256170780520346, + "loss": 0.3621, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "grad_norm": 1.534041404724121, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "learning_rate": 1.1551786158844246e-05, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "loss": 0.2658484876155853, + "step": 27750 + }, + { + "ce_loss": 0.030037011951208115, + "epoch": 9.256170780520346, + "step": 27750 + }, + { + "distill_loss": 0.15437012910842896, + "epoch": 9.256170780520346, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "ref_ce_loss": 0.05552500858902931, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "loss": 0.29277652502059937, + "step": 27750 + }, + { + "ce_loss": 0.03480086103081703, + "epoch": 9.256170780520346, + "step": 27750 + }, + { + "distill_loss": 0.18554575741291046, + "epoch": 9.256170780520346, + "step": 27750 + }, + { + "epoch": 9.256170780520346, + "ref_ce_loss": 0.05078282952308655, + "step": 27750 + }, + { + "epoch": 9.259506337558372, + "loss": 0.3517, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "grad_norm": 0.868651807308197, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "learning_rate": 1.1448910969050363e-05, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "loss": 0.9711523056030273, + "step": 27760 + }, + { + "ce_loss": 0.09152238070964813, + "epoch": 9.259506337558372, + "step": 27760 + }, + { + "distill_loss": 0.22823432087898254, + "epoch": 9.259506337558372, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "ref_ce_loss": 0.07608745247125626, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "loss": 0.34535083174705505, + "step": 27760 + }, + { + "ce_loss": 0.03188411146402359, + "epoch": 9.259506337558372, + "step": 27760 + }, + { + "distill_loss": 0.14663533866405487, + "epoch": 9.259506337558372, + "step": 27760 + }, + { + "epoch": 9.259506337558372, + "ref_ce_loss": 0.0729282945394516, + "step": 27760 + }, + { + "epoch": 9.262841894596397, + "loss": 0.3681, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "grad_norm": 1.2464836835861206, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "learning_rate": 1.1346489260187155e-05, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "loss": 0.2442484349012375, + "step": 27770 + }, + { + "ce_loss": 0.016015250235795975, + "epoch": 9.262841894596397, + "step": 27770 + }, + { + "distill_loss": 0.1256130486726761, + "epoch": 9.262841894596397, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "ref_ce_loss": 0.04880331829190254, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "loss": 0.2674102783203125, + "step": 27770 + }, + { + "ce_loss": 0.01958044432103634, + "epoch": 9.262841894596397, + "step": 27770 + }, + { + "distill_loss": 0.15196123719215393, + "epoch": 9.262841894596397, + "step": 27770 + }, + { + "epoch": 9.262841894596397, + "ref_ce_loss": 0.05983925610780716, + "step": 27770 + }, + { + "epoch": 9.266177451634423, + "loss": 0.3175, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "grad_norm": 1.0320186614990234, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "learning_rate": 1.1244521151791887e-05, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "loss": 0.3113320469856262, + "step": 27780 + }, + { + "ce_loss": 0.034634802490472794, + "epoch": 9.266177451634423, + "step": 27780 + }, + { + "distill_loss": 0.19550350308418274, + "epoch": 9.266177451634423, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "ref_ce_loss": 0.05874716863036156, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "loss": 0.2644667327404022, + "step": 27780 + }, + { + "ce_loss": 0.03832489624619484, + "epoch": 9.266177451634423, + "step": 27780 + }, + { + "distill_loss": 0.11675840616226196, + "epoch": 9.266177451634423, + "step": 27780 + }, + { + "epoch": 9.266177451634423, + "ref_ce_loss": 0.06073426082730293, + "step": 27780 + }, + { + "epoch": 9.269513008672448, + "loss": 0.3293, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "grad_norm": 1.3379724025726318, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "learning_rate": 1.114300676287221e-05, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "loss": 0.42799869179725647, + "step": 27790 + }, + { + "ce_loss": 0.07342212647199631, + "epoch": 9.269513008672448, + "step": 27790 + }, + { + "distill_loss": 0.16694113612174988, + "epoch": 9.269513008672448, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "ref_ce_loss": 0.07477279007434845, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "loss": 0.4181850552558899, + "step": 27790 + }, + { + "ce_loss": 0.04979084059596062, + "epoch": 9.269513008672448, + "step": 27790 + }, + { + "distill_loss": 0.2051783800125122, + "epoch": 9.269513008672448, + "step": 27790 + }, + { + "epoch": 9.269513008672448, + "ref_ce_loss": 0.06781855970621109, + "step": 27790 + }, + { + "epoch": 9.272848565710474, + "loss": 0.325, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "grad_norm": 1.0779036283493042, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "learning_rate": 1.1041946211906418e-05, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "loss": 0.20625613629817963, + "step": 27800 + }, + { + "ce_loss": 0.04753445088863373, + "epoch": 9.272848565710474, + "step": 27800 + }, + { + "distill_loss": 0.12213581055402756, + "epoch": 9.272848565710474, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "ref_ce_loss": 0.03632541745901108, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "loss": 0.35115480422973633, + "step": 27800 + }, + { + "ce_loss": 0.07139122486114502, + "epoch": 9.272848565710474, + "step": 27800 + }, + { + "distill_loss": 0.17072322964668274, + "epoch": 9.272848565710474, + "step": 27800 + }, + { + "epoch": 9.272848565710474, + "ref_ce_loss": 0.06434401869773865, + "step": 27800 + }, + { + "epoch": 9.276184122748498, + "loss": 0.3474, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "grad_norm": 0.9952632784843445, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "learning_rate": 1.0941339616843006e-05, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "loss": 0.4912530779838562, + "step": 27810 + }, + { + "ce_loss": 0.04322037845849991, + "epoch": 9.276184122748498, + "step": 27810 + }, + { + "distill_loss": 0.1929270476102829, + "epoch": 9.276184122748498, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "ref_ce_loss": 0.0678233802318573, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "loss": 0.20256903767585754, + "step": 27810 + }, + { + "ce_loss": 0.022716665640473366, + "epoch": 9.276184122748498, + "step": 27810 + }, + { + "distill_loss": 0.120547354221344, + "epoch": 9.276184122748498, + "step": 27810 + }, + { + "epoch": 9.276184122748498, + "ref_ce_loss": 0.04453602060675621, + "step": 27810 + }, + { + "epoch": 9.279519679786524, + "loss": 0.3334, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "grad_norm": 1.1183052062988281, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "learning_rate": 1.0841187095100668e-05, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "loss": 0.37389904260635376, + "step": 27820 + }, + { + "ce_loss": 0.031152600422501564, + "epoch": 9.279519679786524, + "step": 27820 + }, + { + "distill_loss": 0.13883738219738007, + "epoch": 9.279519679786524, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "ref_ce_loss": 0.048974327743053436, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "loss": 0.40885597467422485, + "step": 27820 + }, + { + "ce_loss": 0.08726421743631363, + "epoch": 9.279519679786524, + "step": 27820 + }, + { + "distill_loss": 0.22419197857379913, + "epoch": 9.279519679786524, + "step": 27820 + }, + { + "epoch": 9.279519679786524, + "ref_ce_loss": 0.09708337485790253, + "step": 27820 + }, + { + "epoch": 9.282855236824549, + "loss": 0.3368, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "grad_norm": 0.9071395397186279, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "learning_rate": 1.0741488763568263e-05, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "loss": 0.2448447197675705, + "step": 27830 + }, + { + "ce_loss": 0.03727051243185997, + "epoch": 9.282855236824549, + "step": 27830 + }, + { + "distill_loss": 0.12109909951686859, + "epoch": 9.282855236824549, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "ref_ce_loss": 0.06266836076974869, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "loss": 0.4689825177192688, + "step": 27830 + }, + { + "ce_loss": 0.033723898231983185, + "epoch": 9.282855236824549, + "step": 27830 + }, + { + "distill_loss": 0.16918250918388367, + "epoch": 9.282855236824549, + "step": 27830 + }, + { + "epoch": 9.282855236824549, + "ref_ce_loss": 0.046082448214292526, + "step": 27830 + }, + { + "epoch": 9.286190793862575, + "loss": 0.3384, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "grad_norm": 1.576418161392212, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "learning_rate": 1.0642244738604356e-05, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "loss": 0.2886964976787567, + "step": 27840 + }, + { + "ce_loss": 0.04928829148411751, + "epoch": 9.286190793862575, + "step": 27840 + }, + { + "distill_loss": 0.1505202353000641, + "epoch": 9.286190793862575, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "ref_ce_loss": 0.059994716197252274, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "loss": 0.31100377440452576, + "step": 27840 + }, + { + "ce_loss": 0.03669671714305878, + "epoch": 9.286190793862575, + "step": 27840 + }, + { + "distill_loss": 0.17948545515537262, + "epoch": 9.286190793862575, + "step": 27840 + }, + { + "epoch": 9.286190793862575, + "ref_ce_loss": 0.05879655480384827, + "step": 27840 + }, + { + "epoch": 9.2895263509006, + "loss": 0.3843, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "grad_norm": 1.546885371208191, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "learning_rate": 1.0543455136037495e-05, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "loss": 0.35940316319465637, + "step": 27850 + }, + { + "ce_loss": 0.059567444026470184, + "epoch": 9.2895263509006, + "step": 27850 + }, + { + "distill_loss": 0.20273186266422272, + "epoch": 9.2895263509006, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "ref_ce_loss": 0.06466156989336014, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "loss": 0.4103323817253113, + "step": 27850 + }, + { + "ce_loss": 0.044299185276031494, + "epoch": 9.2895263509006, + "step": 27850 + }, + { + "distill_loss": 0.15502387285232544, + "epoch": 9.2895263509006, + "step": 27850 + }, + { + "epoch": 9.2895263509006, + "ref_ce_loss": 0.0782664492726326, + "step": 27850 + }, + { + "epoch": 9.292861907938626, + "loss": 0.4034, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "grad_norm": 2.5437092781066895, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "learning_rate": 1.0445120071165759e-05, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "loss": 0.2795525789260864, + "step": 27860 + }, + { + "ce_loss": 0.050536125898361206, + "epoch": 9.292861907938626, + "step": 27860 + }, + { + "distill_loss": 0.16047285497188568, + "epoch": 9.292861907938626, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "ref_ce_loss": 0.047581858932971954, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "loss": 0.24045586585998535, + "step": 27860 + }, + { + "ce_loss": 0.027032705023884773, + "epoch": 9.292861907938626, + "step": 27860 + }, + { + "distill_loss": 0.15831856429576874, + "epoch": 9.292861907938626, + "step": 27860 + }, + { + "epoch": 9.292861907938626, + "ref_ce_loss": 0.05472426861524582, + "step": 27860 + }, + { + "epoch": 9.29619746497665, + "loss": 0.3288, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "grad_norm": 1.1688228845596313, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "learning_rate": 1.034723965875677e-05, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "loss": 0.33194631338119507, + "step": 27870 + }, + { + "ce_loss": 0.031508613377809525, + "epoch": 9.29619746497665, + "step": 27870 + }, + { + "distill_loss": 0.1925259530544281, + "epoch": 9.29619746497665, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "ref_ce_loss": 0.0777515098452568, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "loss": 0.2267584502696991, + "step": 27870 + }, + { + "ce_loss": 0.01810150034725666, + "epoch": 9.29619746497665, + "step": 27870 + }, + { + "distill_loss": 0.1645008623600006, + "epoch": 9.29619746497665, + "step": 27870 + }, + { + "epoch": 9.29619746497665, + "ref_ce_loss": 0.04402892664074898, + "step": 27870 + }, + { + "epoch": 9.299533022014677, + "loss": 0.3294, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "grad_norm": 1.1693073511123657, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "learning_rate": 1.0249814013047455e-05, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "loss": 0.37019437551498413, + "step": 27880 + }, + { + "ce_loss": 0.04375609755516052, + "epoch": 9.299533022014677, + "step": 27880 + }, + { + "distill_loss": 0.15550339221954346, + "epoch": 9.299533022014677, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "ref_ce_loss": 0.06085462495684624, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "loss": 0.30347010493278503, + "step": 27880 + }, + { + "ce_loss": 0.05368916317820549, + "epoch": 9.299533022014677, + "step": 27880 + }, + { + "distill_loss": 0.16740760207176208, + "epoch": 9.299533022014677, + "step": 27880 + }, + { + "epoch": 9.299533022014677, + "ref_ce_loss": 0.08216187357902527, + "step": 27880 + }, + { + "epoch": 9.302868579052701, + "loss": 0.3902, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "grad_norm": 1.1709650754928589, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "learning_rate": 1.0152843247744015e-05, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "loss": 0.22320252656936646, + "step": 27890 + }, + { + "ce_loss": 0.02152726799249649, + "epoch": 9.302868579052701, + "step": 27890 + }, + { + "distill_loss": 0.14967907965183258, + "epoch": 9.302868579052701, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "ref_ce_loss": 0.05175358057022095, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "loss": 0.23550841212272644, + "step": 27890 + }, + { + "ce_loss": 0.024001408368349075, + "epoch": 9.302868579052701, + "step": 27890 + }, + { + "distill_loss": 0.13121449947357178, + "epoch": 9.302868579052701, + "step": 27890 + }, + { + "epoch": 9.302868579052701, + "ref_ce_loss": 0.0575178898870945, + "step": 27890 + }, + { + "epoch": 9.306204136090727, + "loss": 0.3341, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "grad_norm": 1.2723243236541748, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "learning_rate": 1.0056327476021831e-05, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "loss": 0.2968706488609314, + "step": 27900 + }, + { + "ce_loss": 0.025245482102036476, + "epoch": 9.306204136090727, + "step": 27900 + }, + { + "distill_loss": 0.14647573232650757, + "epoch": 9.306204136090727, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "ref_ce_loss": 0.05946638807654381, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "loss": 0.2807845175266266, + "step": 27900 + }, + { + "ce_loss": 0.030696626752614975, + "epoch": 9.306204136090727, + "step": 27900 + }, + { + "distill_loss": 0.18408195674419403, + "epoch": 9.306204136090727, + "step": 27900 + }, + { + "epoch": 9.306204136090727, + "ref_ce_loss": 0.06579753011465073, + "step": 27900 + }, + { + "epoch": 9.309539693128752, + "loss": 0.3317, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "grad_norm": 0.8362427353858948, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "learning_rate": 9.96026681052511e-06, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "loss": 0.5383756160736084, + "step": 27910 + }, + { + "ce_loss": 0.03612060472369194, + "epoch": 9.309539693128752, + "step": 27910 + }, + { + "distill_loss": 0.18417330086231232, + "epoch": 9.309539693128752, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "ref_ce_loss": 0.05404425412416458, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "loss": 0.26968586444854736, + "step": 27910 + }, + { + "ce_loss": 0.04281514510512352, + "epoch": 9.309539693128752, + "step": 27910 + }, + { + "distill_loss": 0.13925480842590332, + "epoch": 9.309539693128752, + "step": 27910 + }, + { + "epoch": 9.309539693128752, + "ref_ce_loss": 0.06350491940975189, + "step": 27910 + }, + { + "epoch": 9.312875250166778, + "loss": 0.3545, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "grad_norm": 1.2448768615722656, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "learning_rate": 9.864661363367101e-06, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "loss": 0.2633543312549591, + "step": 27920 + }, + { + "ce_loss": 0.038385823369026184, + "epoch": 9.312875250166778, + "step": 27920 + }, + { + "distill_loss": 0.14372439682483673, + "epoch": 9.312875250166778, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "ref_ce_loss": 0.061395276337862015, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "loss": 0.46539098024368286, + "step": 27920 + }, + { + "ce_loss": 0.06722792237997055, + "epoch": 9.312875250166778, + "step": 27920 + }, + { + "distill_loss": 0.1808156967163086, + "epoch": 9.312875250166778, + "step": 27920 + }, + { + "epoch": 9.312875250166778, + "ref_ce_loss": 0.06751332432031631, + "step": 27920 + }, + { + "epoch": 9.316210807204802, + "loss": 0.357, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "grad_norm": 1.2320008277893066, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "learning_rate": 9.769511246129526e-06, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "loss": 0.35350170731544495, + "step": 27930 + }, + { + "ce_loss": 0.01923227868974209, + "epoch": 9.316210807204802, + "step": 27930 + }, + { + "distill_loss": 0.16920588910579681, + "epoch": 9.316210807204802, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "ref_ce_loss": 0.057839956134557724, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "loss": 0.348351389169693, + "step": 27930 + }, + { + "ce_loss": 0.07113120704889297, + "epoch": 9.316210807204802, + "step": 27930 + }, + { + "distill_loss": 0.17669452726840973, + "epoch": 9.316210807204802, + "step": 27930 + }, + { + "epoch": 9.316210807204802, + "ref_ce_loss": 0.07339446991682053, + "step": 27930 + }, + { + "epoch": 9.319546364242829, + "loss": 0.3476, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "grad_norm": 0.8937927484512329, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "learning_rate": 9.674816569862887e-06, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "loss": 0.28209495544433594, + "step": 27940 + }, + { + "ce_loss": 0.01969868130981922, + "epoch": 9.319546364242829, + "step": 27940 + }, + { + "distill_loss": 0.13597862422466278, + "epoch": 9.319546364242829, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "ref_ce_loss": 0.06439182907342911, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "loss": 0.36773681640625, + "step": 27940 + }, + { + "ce_loss": 0.04736010730266571, + "epoch": 9.319546364242829, + "step": 27940 + }, + { + "distill_loss": 0.16696158051490784, + "epoch": 9.319546364242829, + "step": 27940 + }, + { + "epoch": 9.319546364242829, + "ref_ce_loss": 0.05129920691251755, + "step": 27940 + }, + { + "epoch": 9.322881921280853, + "loss": 0.348, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "grad_norm": 1.2185635566711426, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "learning_rate": 9.580577445086025e-06, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "loss": 0.3241177201271057, + "step": 27950 + }, + { + "ce_loss": 0.04607256501913071, + "epoch": 9.322881921280853, + "step": 27950 + }, + { + "distill_loss": 0.1539202630519867, + "epoch": 9.322881921280853, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "ref_ce_loss": 0.054715596139431, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "loss": 0.3932846188545227, + "step": 27950 + }, + { + "ce_loss": 0.029749859124422073, + "epoch": 9.322881921280853, + "step": 27950 + }, + { + "distill_loss": 0.20683810114860535, + "epoch": 9.322881921280853, + "step": 27950 + }, + { + "epoch": 9.322881921280853, + "ref_ce_loss": 0.07392022013664246, + "step": 27950 + }, + { + "epoch": 9.32621747831888, + "loss": 0.3126, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "grad_norm": 0.9083806872367859, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "learning_rate": 9.486793981786158e-06, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "loss": 0.3950866758823395, + "step": 27960 + }, + { + "ce_loss": 0.06246607378125191, + "epoch": 9.32621747831888, + "step": 27960 + }, + { + "distill_loss": 0.2375527322292328, + "epoch": 9.32621747831888, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "ref_ce_loss": 0.06229804828763008, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "loss": 0.326924592256546, + "step": 27960 + }, + { + "ce_loss": 0.03776393085718155, + "epoch": 9.32621747831888, + "step": 27960 + }, + { + "distill_loss": 0.1792905628681183, + "epoch": 9.32621747831888, + "step": 27960 + }, + { + "epoch": 9.32621747831888, + "ref_ce_loss": 0.053115006536245346, + "step": 27960 + }, + { + "epoch": 9.329553035356904, + "loss": 0.3583, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "grad_norm": 0.990890622138977, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "learning_rate": 9.393466289418662e-06, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "loss": 0.3162615895271301, + "step": 27970 + }, + { + "ce_loss": 0.055841028690338135, + "epoch": 9.329553035356904, + "step": 27970 + }, + { + "distill_loss": 0.16669586300849915, + "epoch": 9.329553035356904, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "ref_ce_loss": 0.06520682573318481, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "loss": 0.7081807851791382, + "step": 27970 + }, + { + "ce_loss": 0.05520046129822731, + "epoch": 9.329553035356904, + "step": 27970 + }, + { + "distill_loss": 0.17978860437870026, + "epoch": 9.329553035356904, + "step": 27970 + }, + { + "epoch": 9.329553035356904, + "ref_ce_loss": 0.06650884449481964, + "step": 27970 + }, + { + "epoch": 9.33288859239493, + "loss": 0.3937, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "grad_norm": 1.0608779191970825, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "learning_rate": 9.300594476907031e-06, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "loss": 0.3241865932941437, + "step": 27980 + }, + { + "ce_loss": 0.05885559692978859, + "epoch": 9.33288859239493, + "step": 27980 + }, + { + "distill_loss": 0.13292686641216278, + "epoch": 9.33288859239493, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "ref_ce_loss": 0.0678175836801529, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "loss": 0.2897075414657593, + "step": 27980 + }, + { + "ce_loss": 0.01826099492609501, + "epoch": 9.33288859239493, + "step": 27980 + }, + { + "distill_loss": 0.2000298947095871, + "epoch": 9.33288859239493, + "step": 27980 + }, + { + "epoch": 9.33288859239493, + "ref_ce_loss": 0.050724390894174576, + "step": 27980 + }, + { + "epoch": 9.336224149432955, + "loss": 0.3584, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "grad_norm": 1.5480546951293945, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "learning_rate": 9.208178652642651e-06, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "loss": 0.39300987124443054, + "step": 27990 + }, + { + "ce_loss": 0.020847437903285027, + "epoch": 9.336224149432955, + "step": 27990 + }, + { + "distill_loss": 0.15032193064689636, + "epoch": 9.336224149432955, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "ref_ce_loss": 0.06037368252873421, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "loss": 1.7793481349945068, + "step": 27990 + }, + { + "ce_loss": 0.06162801384925842, + "epoch": 9.336224149432955, + "step": 27990 + }, + { + "distill_loss": 0.2171134352684021, + "epoch": 9.336224149432955, + "step": 27990 + }, + { + "epoch": 9.336224149432955, + "ref_ce_loss": 0.08113553375005722, + "step": 27990 + }, + { + "epoch": 9.33955970647098, + "loss": 0.4329, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "grad_norm": 2.510063648223877, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "learning_rate": 9.11621892448471e-06, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "loss": 0.3115657567977905, + "step": 28000 + }, + { + "ce_loss": 0.04781182110309601, + "epoch": 9.33955970647098, + "step": 28000 + }, + { + "distill_loss": 0.19052788615226746, + "epoch": 9.33955970647098, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "ref_ce_loss": 0.05586477741599083, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "loss": 0.3089342415332794, + "step": 28000 + }, + { + "ce_loss": 0.039804648607969284, + "epoch": 9.33955970647098, + "step": 28000 + }, + { + "distill_loss": 0.18772819638252258, + "epoch": 9.33955970647098, + "step": 28000 + }, + { + "epoch": 9.33955970647098, + "ref_ce_loss": 0.05897476524114609, + "step": 28000 + }, + { + "epoch": 9.342895263509005, + "loss": 0.3561, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "grad_norm": 1.1208604574203491, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "learning_rate": 9.02471539976011e-06, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "loss": 0.3297306299209595, + "step": 28010 + }, + { + "ce_loss": 0.055515870451927185, + "epoch": 9.342895263509005, + "step": 28010 + }, + { + "distill_loss": 0.18339870870113373, + "epoch": 9.342895263509005, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "ref_ce_loss": 0.06366925686597824, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "loss": 0.381563276052475, + "step": 28010 + }, + { + "ce_loss": 0.06505367904901505, + "epoch": 9.342895263509005, + "step": 28010 + }, + { + "distill_loss": 0.2163468599319458, + "epoch": 9.342895263509005, + "step": 28010 + }, + { + "epoch": 9.342895263509005, + "ref_ce_loss": 0.06672647595405579, + "step": 28010 + }, + { + "epoch": 9.346230820547031, + "loss": 0.3435, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "grad_norm": 1.4113551378250122, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "learning_rate": 8.933668185263288e-06, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "loss": 0.2933332324028015, + "step": 28020 + }, + { + "ce_loss": 0.06315771490335464, + "epoch": 9.346230820547031, + "step": 28020 + }, + { + "distill_loss": 0.15205860137939453, + "epoch": 9.346230820547031, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "ref_ce_loss": 0.054141364991664886, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "loss": 0.2890551686286926, + "step": 28020 + }, + { + "ce_loss": 0.031157713383436203, + "epoch": 9.346230820547031, + "step": 28020 + }, + { + "distill_loss": 0.15079186856746674, + "epoch": 9.346230820547031, + "step": 28020 + }, + { + "epoch": 9.346230820547031, + "ref_ce_loss": 0.05227925628423691, + "step": 28020 + }, + { + "epoch": 9.349566377585056, + "loss": 0.3245, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "grad_norm": 1.0965887308120728, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "learning_rate": 8.843077387256271e-06, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "loss": 0.2993690073490143, + "step": 28030 + }, + { + "ce_loss": 0.04467551410198212, + "epoch": 9.349566377585056, + "step": 28030 + }, + { + "distill_loss": 0.1874786913394928, + "epoch": 9.349566377585056, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "ref_ce_loss": 0.06677830219268799, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "loss": 0.3004530370235443, + "step": 28030 + }, + { + "ce_loss": 0.02141544595360756, + "epoch": 9.349566377585056, + "step": 28030 + }, + { + "distill_loss": 0.13225886225700378, + "epoch": 9.349566377585056, + "step": 28030 + }, + { + "epoch": 9.349566377585056, + "ref_ce_loss": 0.04972206801176071, + "step": 28030 + }, + { + "epoch": 9.352901934623082, + "loss": 0.3292, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "grad_norm": 1.0150585174560547, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "learning_rate": 8.752943111468082e-06, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "loss": 0.3618951737880707, + "step": 28040 + }, + { + "ce_loss": 0.0385294072329998, + "epoch": 9.352901934623082, + "step": 28040 + }, + { + "distill_loss": 0.19680960476398468, + "epoch": 9.352901934623082, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "ref_ce_loss": 0.08438844233751297, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "loss": 0.3036952316761017, + "step": 28040 + }, + { + "ce_loss": 0.021761497482657433, + "epoch": 9.352901934623082, + "step": 28040 + }, + { + "distill_loss": 0.15879257023334503, + "epoch": 9.352901934623082, + "step": 28040 + }, + { + "epoch": 9.352901934623082, + "ref_ce_loss": 0.056911952793598175, + "step": 28040 + }, + { + "epoch": 9.356237491661107, + "loss": 0.3486, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "grad_norm": 1.1633007526397705, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "learning_rate": 8.663265463095238e-06, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "loss": 0.3508574068546295, + "step": 28050 + }, + { + "ce_loss": 0.06212170422077179, + "epoch": 9.356237491661107, + "step": 28050 + }, + { + "distill_loss": 0.20021311938762665, + "epoch": 9.356237491661107, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "ref_ce_loss": 0.08815666288137436, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "loss": 0.2556230425834656, + "step": 28050 + }, + { + "ce_loss": 0.03384950011968613, + "epoch": 9.356237491661107, + "step": 28050 + }, + { + "distill_loss": 0.1471124291419983, + "epoch": 9.356237491661107, + "step": 28050 + }, + { + "epoch": 9.356237491661107, + "ref_ce_loss": 0.05072746053338051, + "step": 28050 + }, + { + "epoch": 9.359573048699133, + "loss": 0.3159, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "grad_norm": 0.8221273422241211, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "learning_rate": 8.57404454680113e-06, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "loss": 0.6741282343864441, + "step": 28060 + }, + { + "ce_loss": 0.10154782235622406, + "epoch": 9.359573048699133, + "step": 28060 + }, + { + "distill_loss": 0.24765446782112122, + "epoch": 9.359573048699133, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "ref_ce_loss": 0.09763228893280029, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "loss": 0.23623378574848175, + "step": 28060 + }, + { + "ce_loss": 0.037810999900102615, + "epoch": 9.359573048699133, + "step": 28060 + }, + { + "distill_loss": 0.14654332399368286, + "epoch": 9.359573048699133, + "step": 28060 + }, + { + "epoch": 9.359573048699133, + "ref_ce_loss": 0.051665063947439194, + "step": 28060 + }, + { + "epoch": 9.362908605737157, + "loss": 0.3258, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "grad_norm": 0.7860631346702576, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "learning_rate": 8.485280466716284e-06, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "loss": 0.2561877965927124, + "step": 28070 + }, + { + "ce_loss": 0.045687541365623474, + "epoch": 9.362908605737157, + "step": 28070 + }, + { + "distill_loss": 0.16708163917064667, + "epoch": 9.362908605737157, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "ref_ce_loss": 0.04313550889492035, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "loss": 0.1792113482952118, + "step": 28070 + }, + { + "ce_loss": 0.012736702337861061, + "epoch": 9.362908605737157, + "step": 28070 + }, + { + "distill_loss": 0.1242901012301445, + "epoch": 9.362908605737157, + "step": 28070 + }, + { + "epoch": 9.362908605737157, + "ref_ce_loss": 0.028838559985160828, + "step": 28070 + }, + { + "epoch": 9.366244162775184, + "loss": 0.347, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "grad_norm": 1.0601820945739746, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "learning_rate": 8.39697332643783e-06, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "loss": 0.4131063222885132, + "step": 28080 + }, + { + "ce_loss": 0.040342796593904495, + "epoch": 9.366244162775184, + "step": 28080 + }, + { + "distill_loss": 0.22460660338401794, + "epoch": 9.366244162775184, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "ref_ce_loss": 0.08412070572376251, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "loss": 0.38089701533317566, + "step": 28080 + }, + { + "ce_loss": 0.03375541418790817, + "epoch": 9.366244162775184, + "step": 28080 + }, + { + "distill_loss": 0.15850619971752167, + "epoch": 9.366244162775184, + "step": 28080 + }, + { + "epoch": 9.366244162775184, + "ref_ce_loss": 0.06123631075024605, + "step": 28080 + }, + { + "epoch": 9.369579719813208, + "loss": 0.3772, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "grad_norm": 1.264539122581482, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "learning_rate": 8.30912322902968e-06, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "loss": 0.31226345896720886, + "step": 28090 + }, + { + "ce_loss": 0.047768257558345795, + "epoch": 9.369579719813208, + "step": 28090 + }, + { + "distill_loss": 0.17230334877967834, + "epoch": 9.369579719813208, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "ref_ce_loss": 0.06894991546869278, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "loss": 0.3473667502403259, + "step": 28090 + }, + { + "ce_loss": 0.05084200203418732, + "epoch": 9.369579719813208, + "step": 28090 + }, + { + "distill_loss": 0.1941516399383545, + "epoch": 9.369579719813208, + "step": 28090 + }, + { + "epoch": 9.369579719813208, + "ref_ce_loss": 0.07747070491313934, + "step": 28090 + }, + { + "epoch": 9.372915276851234, + "loss": 0.3396, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "grad_norm": 2.1545708179473877, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "learning_rate": 8.221730277022488e-06, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "loss": 0.8439842462539673, + "step": 28100 + }, + { + "ce_loss": 0.044698260724544525, + "epoch": 9.372915276851234, + "step": 28100 + }, + { + "distill_loss": 0.18024414777755737, + "epoch": 9.372915276851234, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "ref_ce_loss": 0.049982912838459015, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "loss": 0.6536636352539062, + "step": 28100 + }, + { + "ce_loss": 0.03565140813589096, + "epoch": 9.372915276851234, + "step": 28100 + }, + { + "distill_loss": 0.18457381427288055, + "epoch": 9.372915276851234, + "step": 28100 + }, + { + "epoch": 9.372915276851234, + "ref_ce_loss": 0.053002066910266876, + "step": 28100 + }, + { + "epoch": 9.376250833889259, + "loss": 0.4018, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "grad_norm": 1.1988686323165894, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "learning_rate": 8.134794572413106e-06, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "loss": 0.3881773352622986, + "step": 28110 + }, + { + "ce_loss": 0.055975291877985, + "epoch": 9.376250833889259, + "step": 28110 + }, + { + "distill_loss": 0.1879711151123047, + "epoch": 9.376250833889259, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "ref_ce_loss": 0.07531239837408066, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "loss": 0.4659346342086792, + "step": 28110 + }, + { + "ce_loss": 0.045423828065395355, + "epoch": 9.376250833889259, + "step": 28110 + }, + { + "distill_loss": 0.14331869781017303, + "epoch": 9.376250833889259, + "step": 28110 + }, + { + "epoch": 9.376250833889259, + "ref_ce_loss": 0.08407054096460342, + "step": 28110 + }, + { + "epoch": 9.379586390927285, + "loss": 0.3772, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "grad_norm": 1.0428130626678467, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "learning_rate": 8.048316216664908e-06, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "loss": 0.40925315022468567, + "step": 28120 + }, + { + "ce_loss": 0.036471374332904816, + "epoch": 9.379586390927285, + "step": 28120 + }, + { + "distill_loss": 0.20590730011463165, + "epoch": 9.379586390927285, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "ref_ce_loss": 0.07913188636302948, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "loss": 0.44879019260406494, + "step": 28120 + }, + { + "ce_loss": 0.06290306150913239, + "epoch": 9.379586390927285, + "step": 28120 + }, + { + "distill_loss": 0.19268600642681122, + "epoch": 9.379586390927285, + "step": 28120 + }, + { + "epoch": 9.379586390927285, + "ref_ce_loss": 0.07037319988012314, + "step": 28120 + }, + { + "epoch": 9.38292194796531, + "loss": 0.3721, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "grad_norm": 1.0024430751800537, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "learning_rate": 7.962295310707424e-06, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "loss": 0.3708806335926056, + "step": 28130 + }, + { + "ce_loss": 0.020602921023964882, + "epoch": 9.38292194796531, + "step": 28130 + }, + { + "distill_loss": 0.13433831930160522, + "epoch": 9.38292194796531, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "ref_ce_loss": 0.053088247776031494, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "loss": 0.24727138876914978, + "step": 28130 + }, + { + "ce_loss": 0.030050475150346756, + "epoch": 9.38292194796531, + "step": 28130 + }, + { + "distill_loss": 0.14565815031528473, + "epoch": 9.38292194796531, + "step": 28130 + }, + { + "epoch": 9.38292194796531, + "ref_ce_loss": 0.07143155485391617, + "step": 28130 + }, + { + "epoch": 9.386257505003336, + "loss": 0.3378, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "grad_norm": 0.9009705781936646, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "learning_rate": 7.876731954936346e-06, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "loss": 0.27454906702041626, + "step": 28140 + }, + { + "ce_loss": 0.03395192325115204, + "epoch": 9.386257505003336, + "step": 28140 + }, + { + "distill_loss": 0.13065120577812195, + "epoch": 9.386257505003336, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "ref_ce_loss": 0.051687318831682205, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "loss": 0.3121086657047272, + "step": 28140 + }, + { + "ce_loss": 0.04262286424636841, + "epoch": 9.386257505003336, + "step": 28140 + }, + { + "distill_loss": 0.18090584874153137, + "epoch": 9.386257505003336, + "step": 28140 + }, + { + "epoch": 9.386257505003336, + "ref_ce_loss": 0.06977871805429459, + "step": 28140 + }, + { + "epoch": 9.38959306204136, + "loss": 0.3314, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "grad_norm": 1.156765103340149, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "learning_rate": 7.791626249213301e-06, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "loss": 0.5182572603225708, + "step": 28150 + }, + { + "ce_loss": 0.028388435021042824, + "epoch": 9.38959306204136, + "step": 28150 + }, + { + "distill_loss": 0.1807260513305664, + "epoch": 9.38959306204136, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "ref_ce_loss": 0.06478993594646454, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "loss": 0.3600061237812042, + "step": 28150 + }, + { + "ce_loss": 0.03588123619556427, + "epoch": 9.38959306204136, + "step": 28150 + }, + { + "distill_loss": 0.21699689328670502, + "epoch": 9.38959306204136, + "step": 28150 + }, + { + "epoch": 9.38959306204136, + "ref_ce_loss": 0.057673607021570206, + "step": 28150 + }, + { + "epoch": 9.392928619079386, + "loss": 0.3238, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "grad_norm": 0.9872157573699951, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "learning_rate": 7.70697829286573e-06, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "loss": 0.3103258013725281, + "step": 28160 + }, + { + "ce_loss": 0.02353808470070362, + "epoch": 9.392928619079386, + "step": 28160 + }, + { + "distill_loss": 0.17616422474384308, + "epoch": 9.392928619079386, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "ref_ce_loss": 0.057305362075567245, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "loss": 0.2516862452030182, + "step": 28160 + }, + { + "ce_loss": 0.015492540784180164, + "epoch": 9.392928619079386, + "step": 28160 + }, + { + "distill_loss": 0.14997431635856628, + "epoch": 9.392928619079386, + "step": 28160 + }, + { + "epoch": 9.392928619079386, + "ref_ce_loss": 0.05999534949660301, + "step": 28160 + }, + { + "epoch": 9.39626417611741, + "loss": 0.326, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "grad_norm": 0.9308201670646667, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "learning_rate": 7.622788184686958e-06, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "loss": 0.31401658058166504, + "step": 28170 + }, + { + "ce_loss": 0.028999431058764458, + "epoch": 9.39626417611741, + "step": 28170 + }, + { + "distill_loss": 0.1737707108259201, + "epoch": 9.39626417611741, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "ref_ce_loss": 0.05012731999158859, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "loss": 0.4022290110588074, + "step": 28170 + }, + { + "ce_loss": 0.05216839537024498, + "epoch": 9.39626417611741, + "step": 28170 + }, + { + "distill_loss": 0.21104665100574493, + "epoch": 9.39626417611741, + "step": 28170 + }, + { + "epoch": 9.39626417611741, + "ref_ce_loss": 0.07470600306987762, + "step": 28170 + }, + { + "epoch": 9.399599733155437, + "loss": 0.3583, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "grad_norm": 1.374057412147522, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "learning_rate": 7.539056022935986e-06, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "loss": 0.38167569041252136, + "step": 28180 + }, + { + "ce_loss": 0.055582933127880096, + "epoch": 9.399599733155437, + "step": 28180 + }, + { + "distill_loss": 0.19373860955238342, + "epoch": 9.399599733155437, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "ref_ce_loss": 0.062473032623529434, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "loss": 0.3486148416996002, + "step": 28180 + }, + { + "ce_loss": 0.025748463347554207, + "epoch": 9.399599733155437, + "step": 28180 + }, + { + "distill_loss": 0.16828617453575134, + "epoch": 9.399599733155437, + "step": 28180 + }, + { + "epoch": 9.399599733155437, + "ref_ce_loss": 0.05271229147911072, + "step": 28180 + }, + { + "epoch": 9.402935290193462, + "loss": 0.3882, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "grad_norm": 1.0406385660171509, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "learning_rate": 7.455781905337089e-06, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "loss": 0.3039928078651428, + "step": 28190 + }, + { + "ce_loss": 0.07042131572961807, + "epoch": 9.402935290193462, + "step": 28190 + }, + { + "distill_loss": 0.1568891704082489, + "epoch": 9.402935290193462, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "ref_ce_loss": 0.057233791798353195, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "loss": 0.68961101770401, + "step": 28190 + }, + { + "ce_loss": 0.022175131365656853, + "epoch": 9.402935290193462, + "step": 28190 + }, + { + "distill_loss": 0.1406789869070053, + "epoch": 9.402935290193462, + "step": 28190 + }, + { + "epoch": 9.402935290193462, + "ref_ce_loss": 0.05205327644944191, + "step": 28190 + }, + { + "epoch": 9.406270847231488, + "loss": 0.345, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "grad_norm": 1.2920187711715698, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "learning_rate": 7.3729659290802555e-06, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "loss": 0.4167248010635376, + "step": 28200 + }, + { + "ce_loss": 0.0833716168999672, + "epoch": 9.406270847231488, + "step": 28200 + }, + { + "distill_loss": 0.21213626861572266, + "epoch": 9.406270847231488, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "ref_ce_loss": 0.1018867939710617, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "loss": 0.3711719512939453, + "step": 28200 + }, + { + "ce_loss": 0.06522081792354584, + "epoch": 9.406270847231488, + "step": 28200 + }, + { + "distill_loss": 0.17841891944408417, + "epoch": 9.406270847231488, + "step": 28200 + }, + { + "epoch": 9.406270847231488, + "ref_ce_loss": 0.06807571649551392, + "step": 28200 + }, + { + "epoch": 9.409606404269512, + "loss": 0.3968, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "grad_norm": 0.9074429273605347, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "learning_rate": 7.2906081908206135e-06, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "loss": 0.23347681760787964, + "step": 28210 + }, + { + "ce_loss": 0.034057483077049255, + "epoch": 9.409606404269512, + "step": 28210 + }, + { + "distill_loss": 0.1385912448167801, + "epoch": 9.409606404269512, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "ref_ce_loss": 0.06067446991801262, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "loss": 0.4569370150566101, + "step": 28210 + }, + { + "ce_loss": 0.03701329976320267, + "epoch": 9.409606404269512, + "step": 28210 + }, + { + "distill_loss": 0.17066524922847748, + "epoch": 9.409606404269512, + "step": 28210 + }, + { + "epoch": 9.409606404269512, + "ref_ce_loss": 0.062454596161842346, + "step": 28210 + }, + { + "epoch": 9.412941961307538, + "loss": 0.3649, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "grad_norm": 0.9702861905097961, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "learning_rate": 7.2087087866784755e-06, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "loss": 0.3839876055717468, + "step": 28220 + }, + { + "ce_loss": 0.0502345971763134, + "epoch": 9.412941961307538, + "step": 28220 + }, + { + "distill_loss": 0.16278576850891113, + "epoch": 9.412941961307538, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "ref_ce_loss": 0.07953373342752457, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "loss": 0.3641839027404785, + "step": 28220 + }, + { + "ce_loss": 0.036535587161779404, + "epoch": 9.412941961307538, + "step": 28220 + }, + { + "distill_loss": 0.14896996319293976, + "epoch": 9.412941961307538, + "step": 28220 + }, + { + "epoch": 9.412941961307538, + "ref_ce_loss": 0.08135932683944702, + "step": 28220 + }, + { + "epoch": 9.416277518345563, + "loss": 0.349, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "grad_norm": 1.0224323272705078, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "learning_rate": 7.127267812239335e-06, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "loss": 0.2518828511238098, + "step": 28230 + }, + { + "ce_loss": 0.02607305347919464, + "epoch": 9.416277518345563, + "step": 28230 + }, + { + "distill_loss": 0.14921921491622925, + "epoch": 9.416277518345563, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "ref_ce_loss": 0.04809458181262016, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "loss": 0.430438756942749, + "step": 28230 + }, + { + "ce_loss": 0.027637429535388947, + "epoch": 9.416277518345563, + "step": 28230 + }, + { + "distill_loss": 0.1614387333393097, + "epoch": 9.416277518345563, + "step": 28230 + }, + { + "epoch": 9.416277518345563, + "ref_ce_loss": 0.057256538420915604, + "step": 28230 + }, + { + "epoch": 9.41961307538359, + "loss": 0.3586, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "grad_norm": 1.3031396865844727, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "learning_rate": 7.046285362553428e-06, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "loss": 0.3229510486125946, + "step": 28240 + }, + { + "ce_loss": 0.024988116696476936, + "epoch": 9.41961307538359, + "step": 28240 + }, + { + "distill_loss": 0.195112943649292, + "epoch": 9.41961307538359, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "ref_ce_loss": 0.06355010718107224, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "loss": 0.1794387400150299, + "step": 28240 + }, + { + "ce_loss": 0.008832786232233047, + "epoch": 9.41961307538359, + "step": 28240 + }, + { + "distill_loss": 0.13273176550865173, + "epoch": 9.41961307538359, + "step": 28240 + }, + { + "epoch": 9.41961307538359, + "ref_ce_loss": 0.037653353065252304, + "step": 28240 + }, + { + "epoch": 9.422948632421614, + "loss": 0.3492, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "grad_norm": 1.0684586763381958, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "learning_rate": 6.9657615321361284e-06, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "loss": 0.3427239656448364, + "step": 28250 + }, + { + "ce_loss": 0.056136444211006165, + "epoch": 9.422948632421614, + "step": 28250 + }, + { + "distill_loss": 0.19746029376983643, + "epoch": 9.422948632421614, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "ref_ce_loss": 0.0558629184961319, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "loss": 0.36456623673439026, + "step": 28250 + }, + { + "ce_loss": 0.04950832575559616, + "epoch": 9.422948632421614, + "step": 28250 + }, + { + "distill_loss": 0.18908393383026123, + "epoch": 9.422948632421614, + "step": 28250 + }, + { + "epoch": 9.422948632421614, + "ref_ce_loss": 0.06433885544538498, + "step": 28250 + }, + { + "epoch": 9.42628418945964, + "loss": 0.336, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "grad_norm": 0.9209970831871033, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "learning_rate": 6.885696414967324e-06, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "loss": 0.29922136664390564, + "step": 28260 + }, + { + "ce_loss": 0.02344970777630806, + "epoch": 9.42628418945964, + "step": 28260 + }, + { + "distill_loss": 0.18124236166477203, + "epoch": 9.42628418945964, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "ref_ce_loss": 0.05942835658788681, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "loss": 0.34051281213760376, + "step": 28260 + }, + { + "ce_loss": 0.052809569984674454, + "epoch": 9.42628418945964, + "step": 28260 + }, + { + "distill_loss": 0.17449237406253815, + "epoch": 9.42628418945964, + "step": 28260 + }, + { + "epoch": 9.42628418945964, + "ref_ce_loss": 0.07341314107179642, + "step": 28260 + }, + { + "epoch": 9.429619746497664, + "loss": 0.326, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "grad_norm": 1.575285792350769, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "learning_rate": 6.806090104491691e-06, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "loss": 0.35301825404167175, + "step": 28270 + }, + { + "ce_loss": 0.04694546386599541, + "epoch": 9.429619746497664, + "step": 28270 + }, + { + "distill_loss": 0.14384940266609192, + "epoch": 9.429619746497664, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "ref_ce_loss": 0.06755001097917557, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "loss": 0.23267331719398499, + "step": 28270 + }, + { + "ce_loss": 0.021335484459996223, + "epoch": 9.429619746497664, + "step": 28270 + }, + { + "distill_loss": 0.1247900053858757, + "epoch": 9.429619746497664, + "step": 28270 + }, + { + "epoch": 9.429619746497664, + "ref_ce_loss": 0.058344725519418716, + "step": 28270 + }, + { + "epoch": 9.43295530353569, + "loss": 0.3326, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "grad_norm": 0.9571400880813599, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "learning_rate": 6.726942693618243e-06, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "loss": 0.3410034477710724, + "step": 28280 + }, + { + "ce_loss": 0.04474620148539543, + "epoch": 9.43295530353569, + "step": 28280 + }, + { + "distill_loss": 0.19685468077659607, + "epoch": 9.43295530353569, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "ref_ce_loss": 0.05485452339053154, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "loss": 0.3124542534351349, + "step": 28280 + }, + { + "ce_loss": 0.04805135726928711, + "epoch": 9.43295530353569, + "step": 28280 + }, + { + "distill_loss": 0.17606854438781738, + "epoch": 9.43295530353569, + "step": 28280 + }, + { + "epoch": 9.43295530353569, + "ref_ce_loss": 0.06179404631257057, + "step": 28280 + }, + { + "epoch": 9.436290860573715, + "loss": 0.3536, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "grad_norm": 2.1430957317352295, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "learning_rate": 6.648254274720644e-06, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "loss": 0.5729732513427734, + "step": 28290 + }, + { + "ce_loss": 0.0864250436425209, + "epoch": 9.436290860573715, + "step": 28290 + }, + { + "distill_loss": 0.2062661051750183, + "epoch": 9.436290860573715, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "ref_ce_loss": 0.10020098090171814, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "loss": 0.25513356924057007, + "step": 28290 + }, + { + "ce_loss": 0.03391078859567642, + "epoch": 9.436290860573715, + "step": 28290 + }, + { + "distill_loss": 0.1471998691558838, + "epoch": 9.436290860573715, + "step": 28290 + }, + { + "epoch": 9.436290860573715, + "ref_ce_loss": 0.05318285524845123, + "step": 28290 + }, + { + "epoch": 9.439626417611741, + "loss": 0.3672, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "grad_norm": 3.0576388835906982, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "learning_rate": 6.570024939636765e-06, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "loss": 0.3838430643081665, + "step": 28300 + }, + { + "ce_loss": 0.05252906307578087, + "epoch": 9.439626417611741, + "step": 28300 + }, + { + "distill_loss": 0.18925431370735168, + "epoch": 9.439626417611741, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "ref_ce_loss": 0.06236163154244423, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "loss": 0.3756336271762848, + "step": 28300 + }, + { + "ce_loss": 0.03550528734922409, + "epoch": 9.439626417611741, + "step": 28300 + }, + { + "distill_loss": 0.1285904347896576, + "epoch": 9.439626417611741, + "step": 28300 + }, + { + "epoch": 9.439626417611741, + "ref_ce_loss": 0.05124359950423241, + "step": 28300 + }, + { + "epoch": 9.442961974649766, + "loss": 0.4092, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "grad_norm": 1.1369801759719849, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "learning_rate": 6.49225477966855e-06, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "loss": 0.3174965977668762, + "step": 28310 + }, + { + "ce_loss": 0.031249184161424637, + "epoch": 9.442961974649766, + "step": 28310 + }, + { + "distill_loss": 0.143992617726326, + "epoch": 9.442961974649766, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "ref_ce_loss": 0.04668223857879639, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "loss": 0.2718362808227539, + "step": 28310 + }, + { + "ce_loss": 0.03560258075594902, + "epoch": 9.442961974649766, + "step": 28310 + }, + { + "distill_loss": 0.17445442080497742, + "epoch": 9.442961974649766, + "step": 28310 + }, + { + "epoch": 9.442961974649766, + "ref_ce_loss": 0.06162376329302788, + "step": 28310 + }, + { + "epoch": 9.446297531687792, + "loss": 0.3277, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "grad_norm": 1.1759134531021118, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "learning_rate": 6.414943885582192e-06, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "loss": 0.31244683265686035, + "step": 28320 + }, + { + "ce_loss": 0.029821570962667465, + "epoch": 9.446297531687792, + "step": 28320 + }, + { + "distill_loss": 0.1645803302526474, + "epoch": 9.446297531687792, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "ref_ce_loss": 0.06684327870607376, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "loss": 0.3090226352214813, + "step": 28320 + }, + { + "ce_loss": 0.051436956971883774, + "epoch": 9.446297531687792, + "step": 28320 + }, + { + "distill_loss": 0.178633451461792, + "epoch": 9.446297531687792, + "step": 28320 + }, + { + "epoch": 9.446297531687792, + "ref_ce_loss": 0.07815413177013397, + "step": 28320 + }, + { + "epoch": 9.449633088725816, + "loss": 0.3709, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "grad_norm": 1.9918997287750244, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "learning_rate": 6.338092347607782e-06, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "loss": 0.4164873957633972, + "step": 28330 + }, + { + "ce_loss": 0.06631603091955185, + "epoch": 9.449633088725816, + "step": 28330 + }, + { + "distill_loss": 0.20923581719398499, + "epoch": 9.449633088725816, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "ref_ce_loss": 0.05797014757990837, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "loss": 0.27249330282211304, + "step": 28330 + }, + { + "ce_loss": 0.06557907909154892, + "epoch": 9.449633088725816, + "step": 28330 + }, + { + "distill_loss": 0.150743305683136, + "epoch": 9.449633088725816, + "step": 28330 + }, + { + "epoch": 9.449633088725816, + "ref_ce_loss": 0.05572926625609398, + "step": 28330 + }, + { + "epoch": 9.452968645763843, + "loss": 0.3418, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "grad_norm": 1.0681581497192383, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "learning_rate": 6.26170025543944e-06, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "loss": 0.4454483985900879, + "step": 28340 + }, + { + "ce_loss": 0.05483581870794296, + "epoch": 9.452968645763843, + "step": 28340 + }, + { + "distill_loss": 0.1950834095478058, + "epoch": 9.452968645763843, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "ref_ce_loss": 0.08751071244478226, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "loss": 0.28754252195358276, + "step": 28340 + }, + { + "ce_loss": 0.01936127245426178, + "epoch": 9.452968645763843, + "step": 28340 + }, + { + "distill_loss": 0.1522999405860901, + "epoch": 9.452968645763843, + "step": 28340 + }, + { + "epoch": 9.452968645763843, + "ref_ce_loss": 0.08448221534490585, + "step": 28340 + }, + { + "epoch": 9.456304202801867, + "loss": 0.3307, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "grad_norm": 2.2357776165008545, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "learning_rate": 6.1857676982348675e-06, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "loss": 0.5820572376251221, + "step": 28350 + }, + { + "ce_loss": 0.0633431151509285, + "epoch": 9.456304202801867, + "step": 28350 + }, + { + "distill_loss": 0.19299493730068207, + "epoch": 9.456304202801867, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "ref_ce_loss": 0.0826629102230072, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "loss": 0.29847437143325806, + "step": 28350 + }, + { + "ce_loss": 0.025454547256231308, + "epoch": 9.456304202801867, + "step": 28350 + }, + { + "distill_loss": 0.16801771521568298, + "epoch": 9.456304202801867, + "step": 28350 + }, + { + "epoch": 9.456304202801867, + "ref_ce_loss": 0.06851398944854736, + "step": 28350 + }, + { + "epoch": 9.459639759839893, + "loss": 0.3551, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "grad_norm": 1.0600039958953857, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "learning_rate": 6.110294764615576e-06, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "loss": 0.3243717849254608, + "step": 28360 + }, + { + "ce_loss": 0.034493587911129, + "epoch": 9.459639759839893, + "step": 28360 + }, + { + "distill_loss": 0.1711745709180832, + "epoch": 9.459639759839893, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "ref_ce_loss": 0.07968221604824066, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "loss": 0.353845477104187, + "step": 28360 + }, + { + "ce_loss": 0.04074955731630325, + "epoch": 9.459639759839893, + "step": 28360 + }, + { + "distill_loss": 0.17961429059505463, + "epoch": 9.459639759839893, + "step": 28360 + }, + { + "epoch": 9.459639759839893, + "ref_ce_loss": 0.0658039078116417, + "step": 28360 + }, + { + "epoch": 9.462975316877918, + "loss": 0.3469, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "grad_norm": 1.6155695915222168, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "learning_rate": 6.035281542666571e-06, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "loss": 0.28634515404701233, + "step": 28370 + }, + { + "ce_loss": 0.03958677500486374, + "epoch": 9.462975316877918, + "step": 28370 + }, + { + "distill_loss": 0.1375505030155182, + "epoch": 9.462975316877918, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "ref_ce_loss": 0.07145129144191742, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "loss": 0.41217994689941406, + "step": 28370 + }, + { + "ce_loss": 0.04383128136396408, + "epoch": 9.462975316877918, + "step": 28370 + }, + { + "distill_loss": 0.1717100888490677, + "epoch": 9.462975316877918, + "step": 28370 + }, + { + "epoch": 9.462975316877918, + "ref_ce_loss": 0.06356970965862274, + "step": 28370 + }, + { + "epoch": 9.466310873915944, + "loss": 0.3584, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "grad_norm": 0.9267465472221375, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "learning_rate": 5.960728119936399e-06, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "loss": 0.3397396504878998, + "step": 28380 + }, + { + "ce_loss": 0.060050420463085175, + "epoch": 9.466310873915944, + "step": 28380 + }, + { + "distill_loss": 0.17834344506263733, + "epoch": 9.466310873915944, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "ref_ce_loss": 0.07434417307376862, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "loss": 0.280172735452652, + "step": 28380 + }, + { + "ce_loss": 0.04056550934910774, + "epoch": 9.466310873915944, + "step": 28380 + }, + { + "distill_loss": 0.12052446603775024, + "epoch": 9.466310873915944, + "step": 28380 + }, + { + "epoch": 9.466310873915944, + "ref_ce_loss": 0.054708823561668396, + "step": 28380 + }, + { + "epoch": 9.469646430953969, + "loss": 0.3581, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "grad_norm": 1.1847563982009888, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "learning_rate": 5.88663458343679e-06, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "loss": 0.36242082715034485, + "step": 28390 + }, + { + "ce_loss": 0.05414927378296852, + "epoch": 9.469646430953969, + "step": 28390 + }, + { + "distill_loss": 0.15785610675811768, + "epoch": 9.469646430953969, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "ref_ce_loss": 0.06853984296321869, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "loss": 0.2403300404548645, + "step": 28390 + }, + { + "ce_loss": 0.028485354036092758, + "epoch": 9.469646430953969, + "step": 28390 + }, + { + "distill_loss": 0.15575310587882996, + "epoch": 9.469646430953969, + "step": 28390 + }, + { + "epoch": 9.469646430953969, + "ref_ce_loss": 0.036536816507577896, + "step": 28390 + }, + { + "epoch": 9.472981987991995, + "loss": 0.3638, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "grad_norm": 1.3311790227890015, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "learning_rate": 5.813001019643016e-06, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "loss": 0.3263969123363495, + "step": 28400 + }, + { + "ce_loss": 0.044232018291950226, + "epoch": 9.472981987991995, + "step": 28400 + }, + { + "distill_loss": 0.16511671245098114, + "epoch": 9.472981987991995, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "ref_ce_loss": 0.08508986979722977, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "loss": 0.2716098427772522, + "step": 28400 + }, + { + "ce_loss": 0.0314282663166523, + "epoch": 9.472981987991995, + "step": 28400 + }, + { + "distill_loss": 0.15377303957939148, + "epoch": 9.472981987991995, + "step": 28400 + }, + { + "epoch": 9.472981987991995, + "ref_ce_loss": 0.05766654759645462, + "step": 28400 + }, + { + "epoch": 9.47631754503002, + "loss": 0.3144, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "grad_norm": 1.8568049669265747, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "learning_rate": 5.739827514493357e-06, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "loss": 0.32465481758117676, + "step": 28410 + }, + { + "ce_loss": 0.04101501405239105, + "epoch": 9.47631754503002, + "step": 28410 + }, + { + "distill_loss": 0.14696335792541504, + "epoch": 9.47631754503002, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "ref_ce_loss": 0.06247838959097862, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "loss": 0.3745761215686798, + "step": 28410 + }, + { + "ce_loss": 0.05117378756403923, + "epoch": 9.47631754503002, + "step": 28410 + }, + { + "distill_loss": 0.18519654870033264, + "epoch": 9.47631754503002, + "step": 28410 + }, + { + "epoch": 9.47631754503002, + "ref_ce_loss": 0.06911353766918182, + "step": 28410 + }, + { + "epoch": 9.479653102068045, + "loss": 0.3539, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "grad_norm": 1.837537407875061, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "learning_rate": 5.667114153389142e-06, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "loss": 0.3297516405582428, + "step": 28420 + }, + { + "ce_loss": 0.029736388474702835, + "epoch": 9.479653102068045, + "step": 28420 + }, + { + "distill_loss": 0.20523138344287872, + "epoch": 9.479653102068045, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "ref_ce_loss": 0.07494564354419708, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "loss": 0.2599937915802002, + "step": 28420 + }, + { + "ce_loss": 0.03557970002293587, + "epoch": 9.479653102068045, + "step": 28420 + }, + { + "distill_loss": 0.16381414234638214, + "epoch": 9.479653102068045, + "step": 28420 + }, + { + "epoch": 9.479653102068045, + "ref_ce_loss": 0.043581776320934296, + "step": 28420 + }, + { + "epoch": 9.48298865910607, + "loss": 0.3452, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "grad_norm": 1.0753458738327026, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "learning_rate": 5.594861021194709e-06, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "loss": 0.2921322286128998, + "step": 28430 + }, + { + "ce_loss": 0.04107680916786194, + "epoch": 9.48298865910607, + "step": 28430 + }, + { + "distill_loss": 0.1625596284866333, + "epoch": 9.48298865910607, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "ref_ce_loss": 0.06385013461112976, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "loss": 0.22679473459720612, + "step": 28430 + }, + { + "ce_loss": 0.01926489546895027, + "epoch": 9.48298865910607, + "step": 28430 + }, + { + "distill_loss": 0.16145484149456024, + "epoch": 9.48298865910607, + "step": 28430 + }, + { + "epoch": 9.48298865910607, + "ref_ce_loss": 0.045216940343379974, + "step": 28430 + }, + { + "epoch": 9.486324216144096, + "loss": 0.3659, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "grad_norm": 1.2148197889328003, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "learning_rate": 5.523068202237136e-06, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "loss": 0.42336106300354004, + "step": 28440 + }, + { + "ce_loss": 0.05302092432975769, + "epoch": 9.486324216144096, + "step": 28440 + }, + { + "distill_loss": 0.22245055437088013, + "epoch": 9.486324216144096, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "ref_ce_loss": 0.06507351249456406, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "loss": 0.28218674659729004, + "step": 28440 + }, + { + "ce_loss": 0.02374487742781639, + "epoch": 9.486324216144096, + "step": 28440 + }, + { + "distill_loss": 0.16830646991729736, + "epoch": 9.486324216144096, + "step": 28440 + }, + { + "epoch": 9.486324216144096, + "ref_ce_loss": 0.05579949542880058, + "step": 28440 + }, + { + "epoch": 9.48965977318212, + "loss": 0.3579, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "grad_norm": 0.9573124051094055, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "learning_rate": 5.45173578030651e-06, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "loss": 0.29650261998176575, + "step": 28450 + }, + { + "ce_loss": 0.029416002333164215, + "epoch": 9.48965977318212, + "step": 28450 + }, + { + "distill_loss": 0.19150586426258087, + "epoch": 9.48965977318212, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "ref_ce_loss": 0.057308148592710495, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "loss": 0.5148458480834961, + "step": 28450 + }, + { + "ce_loss": 0.06594638526439667, + "epoch": 9.48965977318212, + "step": 28450 + }, + { + "distill_loss": 0.19446289539337158, + "epoch": 9.48965977318212, + "step": 28450 + }, + { + "epoch": 9.48965977318212, + "ref_ce_loss": 0.06992293149232864, + "step": 28450 + }, + { + "epoch": 9.492995330220147, + "loss": 0.3606, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "grad_norm": 1.1321545839309692, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "learning_rate": 5.380863838655348e-06, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "loss": 0.25574859976768494, + "step": 28460 + }, + { + "ce_loss": 0.03863326460123062, + "epoch": 9.492995330220147, + "step": 28460 + }, + { + "distill_loss": 0.1366451531648636, + "epoch": 9.492995330220147, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "ref_ce_loss": 0.056483346968889236, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "loss": 0.3196426331996918, + "step": 28460 + }, + { + "ce_loss": 0.043017297983169556, + "epoch": 9.492995330220147, + "step": 28460 + }, + { + "distill_loss": 0.19430020451545715, + "epoch": 9.492995330220147, + "step": 28460 + }, + { + "epoch": 9.492995330220147, + "ref_ce_loss": 0.0640929564833641, + "step": 28460 + }, + { + "epoch": 9.496330887258171, + "loss": 0.3627, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "grad_norm": 1.075714349746704, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "learning_rate": 5.31045245999886e-06, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "loss": 0.28867387771606445, + "step": 28470 + }, + { + "ce_loss": 0.03961672633886337, + "epoch": 9.496330887258171, + "step": 28470 + }, + { + "distill_loss": 0.17778590321540833, + "epoch": 9.496330887258171, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "ref_ce_loss": 0.047913145273923874, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "loss": 0.3131311535835266, + "step": 28470 + }, + { + "ce_loss": 0.04571129009127617, + "epoch": 9.496330887258171, + "step": 28470 + }, + { + "distill_loss": 0.18334239721298218, + "epoch": 9.496330887258171, + "step": 28470 + }, + { + "epoch": 9.496330887258171, + "ref_ce_loss": 0.08380914479494095, + "step": 28470 + }, + { + "epoch": 9.499666444296198, + "loss": 0.371, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "grad_norm": 0.9681311845779419, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "learning_rate": 5.240501726514735e-06, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "loss": 0.3067833185195923, + "step": 28480 + }, + { + "ce_loss": 0.030180882662534714, + "epoch": 9.499666444296198, + "step": 28480 + }, + { + "distill_loss": 0.16230109333992004, + "epoch": 9.499666444296198, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "ref_ce_loss": 0.055661167949438095, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "loss": 0.31161099672317505, + "step": 28480 + }, + { + "ce_loss": 0.04685712605714798, + "epoch": 9.499666444296198, + "step": 28480 + }, + { + "distill_loss": 0.15889328718185425, + "epoch": 9.499666444296198, + "step": 28480 + }, + { + "epoch": 9.499666444296198, + "ref_ce_loss": 0.06073801964521408, + "step": 28480 + }, + { + "epoch": 9.503002001334222, + "loss": 0.3329, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "grad_norm": 1.2925001382827759, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "learning_rate": 5.171011719842955e-06, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "loss": 0.39370524883270264, + "step": 28490 + }, + { + "ce_loss": 0.041692305356264114, + "epoch": 9.503002001334222, + "step": 28490 + }, + { + "distill_loss": 0.15185731649398804, + "epoch": 9.503002001334222, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "ref_ce_loss": 0.0779699832201004, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "loss": 0.39848214387893677, + "step": 28490 + }, + { + "ce_loss": 0.06291890889406204, + "epoch": 9.503002001334222, + "step": 28490 + }, + { + "distill_loss": 0.19283688068389893, + "epoch": 9.503002001334222, + "step": 28490 + }, + { + "epoch": 9.503002001334222, + "ref_ce_loss": 0.06672590225934982, + "step": 28490 + }, + { + "epoch": 9.506337558372248, + "loss": 0.3091, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "grad_norm": 1.1803241968154907, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "learning_rate": 5.101982521085846e-06, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "loss": 0.3626993000507355, + "step": 28500 + }, + { + "ce_loss": 0.06457323580980301, + "epoch": 9.506337558372248, + "step": 28500 + }, + { + "distill_loss": 0.20585688948631287, + "epoch": 9.506337558372248, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "ref_ce_loss": 0.0644054189324379, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "loss": 0.22315269708633423, + "step": 28500 + }, + { + "ce_loss": 0.021638058125972748, + "epoch": 9.506337558372248, + "step": 28500 + }, + { + "distill_loss": 0.14434072375297546, + "epoch": 9.506337558372248, + "step": 28500 + }, + { + "epoch": 9.506337558372248, + "ref_ce_loss": 0.046219129115343094, + "step": 28500 + }, + { + "epoch": 9.509673115410273, + "loss": 0.3629, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "grad_norm": 1.8690041303634644, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "learning_rate": 5.033414210807896e-06, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "loss": 0.3256169557571411, + "step": 28510 + }, + { + "ce_loss": 0.04394880309700966, + "epoch": 9.509673115410273, + "step": 28510 + }, + { + "distill_loss": 0.1812673956155777, + "epoch": 9.509673115410273, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "ref_ce_loss": 0.06298881769180298, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "loss": 0.3566301465034485, + "step": 28510 + }, + { + "ce_loss": 0.05997853726148605, + "epoch": 9.509673115410273, + "step": 28510 + }, + { + "distill_loss": 0.16302239894866943, + "epoch": 9.509673115410273, + "step": 28510 + }, + { + "epoch": 9.509673115410273, + "ref_ce_loss": 0.06253132224082947, + "step": 28510 + }, + { + "epoch": 9.513008672448299, + "loss": 0.3379, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "grad_norm": 0.9525768756866455, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "learning_rate": 4.9653068690357575e-06, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "loss": 0.5037875175476074, + "step": 28520 + }, + { + "ce_loss": 0.03574688360095024, + "epoch": 9.513008672448299, + "step": 28520 + }, + { + "distill_loss": 0.17958644032478333, + "epoch": 9.513008672448299, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "ref_ce_loss": 0.051626432687044144, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "loss": 0.3209737539291382, + "step": 28520 + }, + { + "ce_loss": 0.04713620990514755, + "epoch": 9.513008672448299, + "step": 28520 + }, + { + "distill_loss": 0.17461776733398438, + "epoch": 9.513008672448299, + "step": 28520 + }, + { + "epoch": 9.513008672448299, + "ref_ce_loss": 0.05999940261244774, + "step": 28520 + }, + { + "epoch": 9.516344229486323, + "loss": 0.3456, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "grad_norm": 1.2907161712646484, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "learning_rate": 4.897660575258023e-06, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "loss": 0.48610827326774597, + "step": 28530 + }, + { + "ce_loss": 0.05657501518726349, + "epoch": 9.516344229486323, + "step": 28530 + }, + { + "distill_loss": 0.2150333821773529, + "epoch": 9.516344229486323, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "ref_ce_loss": 0.08219249546527863, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "loss": 0.2625681161880493, + "step": 28530 + }, + { + "ce_loss": 0.024848783388733864, + "epoch": 9.516344229486323, + "step": 28530 + }, + { + "distill_loss": 0.15303784608840942, + "epoch": 9.516344229486323, + "step": 28530 + }, + { + "epoch": 9.516344229486323, + "ref_ce_loss": 0.05679015815258026, + "step": 28530 + }, + { + "epoch": 9.51967978652435, + "loss": 0.3443, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "grad_norm": 0.9048293828964233, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "learning_rate": 4.830475408425139e-06, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "loss": 0.3716161251068115, + "step": 28540 + }, + { + "ce_loss": 0.04324284568428993, + "epoch": 9.51967978652435, + "step": 28540 + }, + { + "distill_loss": 0.15873301029205322, + "epoch": 9.51967978652435, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "ref_ce_loss": 0.05497406795620918, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "loss": 0.40590211749076843, + "step": 28540 + }, + { + "ce_loss": 0.06277185678482056, + "epoch": 9.51967978652435, + "step": 28540 + }, + { + "distill_loss": 0.19825831055641174, + "epoch": 9.51967978652435, + "step": 28540 + }, + { + "epoch": 9.51967978652435, + "ref_ce_loss": 0.059562280774116516, + "step": 28540 + }, + { + "epoch": 9.523015343562374, + "loss": 0.3976, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "grad_norm": 1.2088080644607544, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "learning_rate": 4.76375144694945e-06, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "loss": 0.36790820956230164, + "step": 28550 + }, + { + "ce_loss": 0.051003482192754745, + "epoch": 9.523015343562374, + "step": 28550 + }, + { + "distill_loss": 0.2250767946243286, + "epoch": 9.523015343562374, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "ref_ce_loss": 0.06892771273851395, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "loss": 0.32154223322868347, + "step": 28550 + }, + { + "ce_loss": 0.0505017451941967, + "epoch": 9.523015343562374, + "step": 28550 + }, + { + "distill_loss": 0.17499235272407532, + "epoch": 9.523015343562374, + "step": 28550 + }, + { + "epoch": 9.523015343562374, + "ref_ce_loss": 0.04032899811863899, + "step": 28550 + }, + { + "epoch": 9.5263509006004, + "loss": 0.3422, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "grad_norm": 1.096779465675354, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "learning_rate": 4.697488768705016e-06, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "loss": 0.3258456587791443, + "step": 28560 + }, + { + "ce_loss": 0.04681219905614853, + "epoch": 9.5263509006004, + "step": 28560 + }, + { + "distill_loss": 0.15567587316036224, + "epoch": 9.5263509006004, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "ref_ce_loss": 0.056013528257608414, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "loss": 0.3023555278778076, + "step": 28560 + }, + { + "ce_loss": 0.033247340470552444, + "epoch": 9.5263509006004, + "step": 28560 + }, + { + "distill_loss": 0.16405390202999115, + "epoch": 9.5263509006004, + "step": 28560 + }, + { + "epoch": 9.5263509006004, + "ref_ce_loss": 0.06972135603427887, + "step": 28560 + }, + { + "epoch": 9.529686457638425, + "loss": 0.3583, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "grad_norm": 0.7542944550514221, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "learning_rate": 4.631687451027489e-06, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "loss": 0.29079288244247437, + "step": 28570 + }, + { + "ce_loss": 0.029417484998703003, + "epoch": 9.529686457638425, + "step": 28570 + }, + { + "distill_loss": 0.18318980932235718, + "epoch": 9.529686457638425, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "ref_ce_loss": 0.05650854483246803, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "loss": 0.2807072103023529, + "step": 28570 + }, + { + "ce_loss": 0.029622601345181465, + "epoch": 9.529686457638425, + "step": 28570 + }, + { + "distill_loss": 0.16439847648143768, + "epoch": 9.529686457638425, + "step": 28570 + }, + { + "epoch": 9.529686457638425, + "ref_ce_loss": 0.057991236448287964, + "step": 28570 + }, + { + "epoch": 9.533022014676451, + "loss": 0.308, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "grad_norm": 1.3596519231796265, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "learning_rate": 4.566347570714102e-06, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "loss": 0.22487899661064148, + "step": 28580 + }, + { + "ce_loss": 0.03615245968103409, + "epoch": 9.533022014676451, + "step": 28580 + }, + { + "distill_loss": 0.14556825160980225, + "epoch": 9.533022014676451, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "ref_ce_loss": 0.042983442544937134, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "loss": 0.5066213607788086, + "step": 28580 + }, + { + "ce_loss": 0.055620044469833374, + "epoch": 9.533022014676451, + "step": 28580 + }, + { + "distill_loss": 0.17550809681415558, + "epoch": 9.533022014676451, + "step": 28580 + }, + { + "epoch": 9.533022014676451, + "ref_ce_loss": 0.07425516843795776, + "step": 28580 + }, + { + "epoch": 9.536357571714476, + "loss": 0.3642, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "grad_norm": 0.8081924319267273, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "learning_rate": 4.5014692040235455e-06, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "loss": 0.4211300015449524, + "step": 28590 + }, + { + "ce_loss": 0.02656661719083786, + "epoch": 9.536357571714476, + "step": 28590 + }, + { + "distill_loss": 0.15321145951747894, + "epoch": 9.536357571714476, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "ref_ce_loss": 0.06345057487487793, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "loss": 0.3505942225456238, + "step": 28590 + }, + { + "ce_loss": 0.01590488851070404, + "epoch": 9.536357571714476, + "step": 28590 + }, + { + "distill_loss": 0.15594804286956787, + "epoch": 9.536357571714476, + "step": 28590 + }, + { + "epoch": 9.536357571714476, + "ref_ce_loss": 0.0463242270052433, + "step": 28590 + }, + { + "epoch": 9.539693128752502, + "loss": 0.3481, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "grad_norm": 1.1376338005065918, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "learning_rate": 4.437052426675781e-06, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "loss": 0.3184284567832947, + "step": 28600 + }, + { + "ce_loss": 0.025577332824468613, + "epoch": 9.539693128752502, + "step": 28600 + }, + { + "distill_loss": 0.14735321700572968, + "epoch": 9.539693128752502, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "ref_ce_loss": 0.05905190855264664, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "loss": 1.036750316619873, + "step": 28600 + }, + { + "ce_loss": 0.051478635519742966, + "epoch": 9.539693128752502, + "step": 28600 + }, + { + "distill_loss": 0.19417119026184082, + "epoch": 9.539693128752502, + "step": 28600 + }, + { + "epoch": 9.539693128752502, + "ref_ce_loss": 0.09487366676330566, + "step": 28600 + }, + { + "epoch": 9.543028685790526, + "loss": 0.3749, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "grad_norm": 1.288905143737793, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "learning_rate": 4.373097313852182e-06, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "loss": 0.33301958441734314, + "step": 28610 + }, + { + "ce_loss": 0.03987917676568031, + "epoch": 9.543028685790526, + "step": 28610 + }, + { + "distill_loss": 0.20812353491783142, + "epoch": 9.543028685790526, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "ref_ce_loss": 0.0582069493830204, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "loss": 0.5744978785514832, + "step": 28610 + }, + { + "ce_loss": 0.07136088609695435, + "epoch": 9.543028685790526, + "step": 28610 + }, + { + "distill_loss": 0.18274860084056854, + "epoch": 9.543028685790526, + "step": 28610 + }, + { + "epoch": 9.543028685790526, + "ref_ce_loss": 0.046957675367593765, + "step": 28610 + }, + { + "epoch": 9.546364242828552, + "loss": 0.3571, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "grad_norm": 1.8873906135559082, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "learning_rate": 4.309603940195261e-06, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "loss": 0.2871871888637543, + "step": 28620 + }, + { + "ce_loss": 0.022181112319231033, + "epoch": 9.546364242828552, + "step": 28620 + }, + { + "distill_loss": 0.16967681050300598, + "epoch": 9.546364242828552, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "ref_ce_loss": 0.04003141447901726, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "loss": 0.27087387442588806, + "step": 28620 + }, + { + "ce_loss": 0.019921233877539635, + "epoch": 9.546364242828552, + "step": 28620 + }, + { + "distill_loss": 0.1713012307882309, + "epoch": 9.546364242828552, + "step": 28620 + }, + { + "epoch": 9.546364242828552, + "ref_ce_loss": 0.05143344774842262, + "step": 28620 + }, + { + "epoch": 9.549699799866577, + "loss": 0.3838, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "grad_norm": 2.1031012535095215, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "learning_rate": 4.246572379808545e-06, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "loss": 0.24774429202079773, + "step": 28630 + }, + { + "ce_loss": 0.021202141419053078, + "epoch": 9.549699799866577, + "step": 28630 + }, + { + "distill_loss": 0.16074173152446747, + "epoch": 9.549699799866577, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "ref_ce_loss": 0.05151132866740227, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "loss": 0.5488268733024597, + "step": 28630 + }, + { + "ce_loss": 0.046389929950237274, + "epoch": 9.549699799866577, + "step": 28630 + }, + { + "distill_loss": 0.1870126873254776, + "epoch": 9.549699799866577, + "step": 28630 + }, + { + "epoch": 9.549699799866577, + "ref_ce_loss": 0.06963532418012619, + "step": 28630 + }, + { + "epoch": 9.553035356904603, + "loss": 0.3577, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "grad_norm": 0.8177258968353271, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "learning_rate": 4.18400270625674e-06, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "loss": 0.2963216304779053, + "step": 28640 + }, + { + "ce_loss": 0.038040950894355774, + "epoch": 9.553035356904603, + "step": 28640 + }, + { + "distill_loss": 0.18547900021076202, + "epoch": 9.553035356904603, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "ref_ce_loss": 0.05713136866688728, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "loss": 0.42804527282714844, + "step": 28640 + }, + { + "ce_loss": 0.05795781686902046, + "epoch": 9.553035356904603, + "step": 28640 + }, + { + "distill_loss": 0.15100808441638947, + "epoch": 9.553035356904603, + "step": 28640 + }, + { + "epoch": 9.553035356904603, + "ref_ce_loss": 0.0495360866189003, + "step": 28640 + }, + { + "epoch": 9.556370913942628, + "loss": 0.3788, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "grad_norm": 1.4477307796478271, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "learning_rate": 4.121894992565345e-06, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "loss": 0.2551484704017639, + "step": 28650 + }, + { + "ce_loss": 0.03109881281852722, + "epoch": 9.556370913942628, + "step": 28650 + }, + { + "distill_loss": 0.17904944717884064, + "epoch": 9.556370913942628, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "ref_ce_loss": 0.04486895352602005, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "loss": 0.2952268719673157, + "step": 28650 + }, + { + "ce_loss": 0.0575919933617115, + "epoch": 9.556370913942628, + "step": 28650 + }, + { + "distill_loss": 0.17165498435497284, + "epoch": 9.556370913942628, + "step": 28650 + }, + { + "epoch": 9.556370913942628, + "ref_ce_loss": 0.06577247381210327, + "step": 28650 + }, + { + "epoch": 9.559706470980654, + "loss": 0.319, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "grad_norm": 1.4955540895462036, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "learning_rate": 4.060249311220687e-06, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "loss": 0.3374966084957123, + "step": 28660 + }, + { + "ce_loss": 0.044201888144016266, + "epoch": 9.559706470980654, + "step": 28660 + }, + { + "distill_loss": 0.1826128214597702, + "epoch": 9.559706470980654, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "ref_ce_loss": 0.05706416070461273, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "loss": 0.30890583992004395, + "step": 28660 + }, + { + "ce_loss": 0.045475609600543976, + "epoch": 9.559706470980654, + "step": 28660 + }, + { + "distill_loss": 0.1853654533624649, + "epoch": 9.559706470980654, + "step": 28660 + }, + { + "epoch": 9.559706470980654, + "ref_ce_loss": 0.03793710097670555, + "step": 28660 + }, + { + "epoch": 9.563042028018678, + "loss": 0.342, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "grad_norm": 1.226629614830017, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "learning_rate": 3.9990657341700156e-06, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "loss": 0.30386072397232056, + "step": 28670 + }, + { + "ce_loss": 0.023753361776471138, + "epoch": 9.563042028018678, + "step": 28670 + }, + { + "distill_loss": 0.14758071303367615, + "epoch": 9.563042028018678, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "ref_ce_loss": 0.0571836493909359, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "loss": 0.2814745008945465, + "step": 28670 + }, + { + "ce_loss": 0.040350768715143204, + "epoch": 9.563042028018678, + "step": 28670 + }, + { + "distill_loss": 0.17561466991901398, + "epoch": 9.563042028018678, + "step": 28670 + }, + { + "epoch": 9.563042028018678, + "ref_ce_loss": 0.06502601504325867, + "step": 28670 + }, + { + "epoch": 9.566377585056705, + "loss": 0.3655, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "grad_norm": 0.9547922015190125, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "learning_rate": 3.938344332821053e-06, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "loss": 0.32980644702911377, + "step": 28680 + }, + { + "ce_loss": 0.03864011913537979, + "epoch": 9.566377585056705, + "step": 28680 + }, + { + "distill_loss": 0.20437286794185638, + "epoch": 9.566377585056705, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "ref_ce_loss": 0.08655925840139389, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "loss": 0.3018178641796112, + "step": 28680 + }, + { + "ce_loss": 0.04785173386335373, + "epoch": 9.566377585056705, + "step": 28680 + }, + { + "distill_loss": 0.17461104691028595, + "epoch": 9.566377585056705, + "step": 28680 + }, + { + "epoch": 9.566377585056705, + "ref_ce_loss": 0.07911676168441772, + "step": 28680 + }, + { + "epoch": 9.569713142094729, + "loss": 0.3689, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "grad_norm": 1.2437167167663574, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "learning_rate": 3.878085178042312e-06, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "loss": 0.614569365978241, + "step": 28690 + }, + { + "ce_loss": 0.04580362141132355, + "epoch": 9.569713142094729, + "step": 28690 + }, + { + "distill_loss": 0.16972200572490692, + "epoch": 9.569713142094729, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "ref_ce_loss": 0.06595150381326675, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "loss": 0.22208955883979797, + "step": 28690 + }, + { + "ce_loss": 0.020961729809641838, + "epoch": 9.569713142094729, + "step": 28690 + }, + { + "distill_loss": 0.15288269519805908, + "epoch": 9.569713142094729, + "step": 28690 + }, + { + "epoch": 9.569713142094729, + "ref_ce_loss": 0.04804127290844917, + "step": 28690 + }, + { + "epoch": 9.573048699132755, + "loss": 0.3584, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "grad_norm": 1.0217654705047607, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "learning_rate": 3.8182883401626015e-06, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "loss": 0.30012840032577515, + "step": 28700 + }, + { + "ce_loss": 0.037350449711084366, + "epoch": 9.573048699132755, + "step": 28700 + }, + { + "distill_loss": 0.1833217740058899, + "epoch": 9.573048699132755, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "ref_ce_loss": 0.05961844325065613, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "loss": 0.36112287640571594, + "step": 28700 + }, + { + "ce_loss": 0.06320945918560028, + "epoch": 9.573048699132755, + "step": 28700 + }, + { + "distill_loss": 0.20540209114551544, + "epoch": 9.573048699132755, + "step": 28700 + }, + { + "epoch": 9.573048699132755, + "ref_ce_loss": 0.06766784191131592, + "step": 28700 + }, + { + "epoch": 9.57638425617078, + "loss": 0.3146, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "grad_norm": 1.1562711000442505, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "learning_rate": 3.758953888971295e-06, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "loss": 0.3072027564048767, + "step": 28710 + }, + { + "ce_loss": 0.04621975123882294, + "epoch": 9.57638425617078, + "step": 28710 + }, + { + "distill_loss": 0.19785268604755402, + "epoch": 9.57638425617078, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "ref_ce_loss": 0.06301284581422806, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "loss": 0.275651216506958, + "step": 28710 + }, + { + "ce_loss": 0.04094674810767174, + "epoch": 9.57638425617078, + "step": 28710 + }, + { + "distill_loss": 0.13219065964221954, + "epoch": 9.57638425617078, + "step": 28710 + }, + { + "epoch": 9.57638425617078, + "ref_ce_loss": 0.07614676654338837, + "step": 28710 + }, + { + "epoch": 9.579719813208806, + "loss": 0.2964, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "grad_norm": 1.2158347368240356, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "learning_rate": 3.7000818937181546e-06, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "loss": 0.24544508755207062, + "step": 28720 + }, + { + "ce_loss": 0.03466008976101875, + "epoch": 9.579719813208806, + "step": 28720 + }, + { + "distill_loss": 0.16733793914318085, + "epoch": 9.579719813208806, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "ref_ce_loss": 0.04320121556520462, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "loss": 0.3183065950870514, + "step": 28720 + }, + { + "ce_loss": 0.04318279027938843, + "epoch": 9.579719813208806, + "step": 28720 + }, + { + "distill_loss": 0.16793055832386017, + "epoch": 9.579719813208806, + "step": 28720 + }, + { + "epoch": 9.579719813208806, + "ref_ce_loss": 0.053741853684186935, + "step": 28720 + }, + { + "epoch": 9.58305537024683, + "loss": 0.3425, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "grad_norm": 0.9732711315155029, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "learning_rate": 3.6416724231130186e-06, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "loss": 0.3047448694705963, + "step": 28730 + }, + { + "ce_loss": 0.02543429099023342, + "epoch": 9.58305537024683, + "step": 28730 + }, + { + "distill_loss": 0.18523433804512024, + "epoch": 9.58305537024683, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "ref_ce_loss": 0.046858031302690506, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "loss": 0.23915736377239227, + "step": 28730 + }, + { + "ce_loss": 0.02462298981845379, + "epoch": 9.58305537024683, + "step": 28730 + }, + { + "distill_loss": 0.16042344272136688, + "epoch": 9.58305537024683, + "step": 28730 + }, + { + "epoch": 9.58305537024683, + "ref_ce_loss": 0.05374724790453911, + "step": 28730 + }, + { + "epoch": 9.586390927284857, + "loss": 0.3176, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "grad_norm": 1.7445679903030396, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "learning_rate": 3.5837255453261554e-06, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "loss": 0.32195746898651123, + "step": 28740 + }, + { + "ce_loss": 0.0703810602426529, + "epoch": 9.586390927284857, + "step": 28740 + }, + { + "distill_loss": 0.17174682021141052, + "epoch": 9.586390927284857, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "ref_ce_loss": 0.06239185482263565, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "loss": 0.3455526530742645, + "step": 28740 + }, + { + "ce_loss": 0.03157772123813629, + "epoch": 9.586390927284857, + "step": 28740 + }, + { + "distill_loss": 0.17570674419403076, + "epoch": 9.586390927284857, + "step": 28740 + }, + { + "epoch": 9.586390927284857, + "ref_ce_loss": 0.056114934384822845, + "step": 28740 + }, + { + "epoch": 9.589726484322881, + "loss": 0.3097, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "grad_norm": 1.0213404893875122, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "learning_rate": 3.5262413279876894e-06, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "loss": 0.4088827967643738, + "step": 28750 + }, + { + "ce_loss": 0.07769577205181122, + "epoch": 9.589726484322881, + "step": 28750 + }, + { + "distill_loss": 0.2127758264541626, + "epoch": 9.589726484322881, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "ref_ce_loss": 0.06413999199867249, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "loss": 0.529901385307312, + "step": 28750 + }, + { + "ce_loss": 0.03473932668566704, + "epoch": 9.589726484322881, + "step": 28750 + }, + { + "distill_loss": 0.1991306096315384, + "epoch": 9.589726484322881, + "step": 28750 + }, + { + "epoch": 9.589726484322881, + "ref_ce_loss": 0.06662367284297943, + "step": 28750 + }, + { + "epoch": 9.593062041360907, + "loss": 0.343, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "grad_norm": 1.1475307941436768, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "learning_rate": 3.4692198381879536e-06, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "loss": 0.32468339800834656, + "step": 28760 + }, + { + "ce_loss": 0.04863545671105385, + "epoch": 9.593062041360907, + "step": 28760 + }, + { + "distill_loss": 0.16399593651294708, + "epoch": 9.593062041360907, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "ref_ce_loss": 0.0735296681523323, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "loss": 0.42887789011001587, + "step": 28760 + }, + { + "ce_loss": 0.05044868215918541, + "epoch": 9.593062041360907, + "step": 28760 + }, + { + "distill_loss": 0.1644851565361023, + "epoch": 9.593062041360907, + "step": 28760 + }, + { + "epoch": 9.593062041360907, + "ref_ce_loss": 0.05538942664861679, + "step": 28760 + }, + { + "epoch": 9.596397598398932, + "loss": 0.331, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "grad_norm": 6.332052230834961, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "learning_rate": 3.412661142477136e-06, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "loss": 0.30600395798683167, + "step": 28770 + }, + { + "ce_loss": 0.036587513983249664, + "epoch": 9.596397598398932, + "step": 28770 + }, + { + "distill_loss": 0.1614750772714615, + "epoch": 9.596397598398932, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "ref_ce_loss": 0.07749360054731369, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "loss": 0.3054625689983368, + "step": 28770 + }, + { + "ce_loss": 0.06228421628475189, + "epoch": 9.596397598398932, + "step": 28770 + }, + { + "distill_loss": 0.1685221642255783, + "epoch": 9.596397598398932, + "step": 28770 + }, + { + "epoch": 9.596397598398932, + "ref_ce_loss": 0.07439344376325607, + "step": 28770 + }, + { + "epoch": 9.599733155436958, + "loss": 0.3373, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "grad_norm": 1.7799866199493408, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "learning_rate": 3.356565306865367e-06, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "loss": 0.27912962436676025, + "step": 28780 + }, + { + "ce_loss": 0.027723034843802452, + "epoch": 9.599733155436958, + "step": 28780 + }, + { + "distill_loss": 0.17505702376365662, + "epoch": 9.599733155436958, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "ref_ce_loss": 0.05043905973434448, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "loss": 0.21506954729557037, + "step": 28780 + }, + { + "ce_loss": 0.036209072917699814, + "epoch": 9.599733155436958, + "step": 28780 + }, + { + "distill_loss": 0.12021758407354355, + "epoch": 9.599733155436958, + "step": 28780 + }, + { + "epoch": 9.599733155436958, + "ref_ce_loss": 0.058201637119054794, + "step": 28780 + }, + { + "epoch": 9.603068712474983, + "loss": 0.3961, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "grad_norm": 1.1351579427719116, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "learning_rate": 3.300932396822409e-06, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "loss": 0.377149760723114, + "step": 28790 + }, + { + "ce_loss": 0.02792281098663807, + "epoch": 9.603068712474983, + "step": 28790 + }, + { + "distill_loss": 0.14689025282859802, + "epoch": 9.603068712474983, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "ref_ce_loss": 0.05250436067581177, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "loss": 0.3787650465965271, + "step": 28790 + }, + { + "ce_loss": 0.05750597268342972, + "epoch": 9.603068712474983, + "step": 28790 + }, + { + "distill_loss": 0.17340369522571564, + "epoch": 9.603068712474983, + "step": 28790 + }, + { + "epoch": 9.603068712474983, + "ref_ce_loss": 0.07258488237857819, + "step": 28790 + }, + { + "epoch": 9.606404269513009, + "loss": 0.3483, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "grad_norm": 1.082783818244934, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "learning_rate": 3.245762477277969e-06, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "loss": 0.3081613779067993, + "step": 28800 + }, + { + "ce_loss": 0.01638505607843399, + "epoch": 9.606404269513009, + "step": 28800 + }, + { + "distill_loss": 0.18335667252540588, + "epoch": 9.606404269513009, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "ref_ce_loss": 0.059051476418972015, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "loss": 0.28536441922187805, + "step": 28800 + }, + { + "ce_loss": 0.04424785077571869, + "epoch": 9.606404269513009, + "step": 28800 + }, + { + "distill_loss": 0.18336057662963867, + "epoch": 9.606404269513009, + "step": 28800 + }, + { + "epoch": 9.606404269513009, + "ref_ce_loss": 0.05742606520652771, + "step": 28800 + }, + { + "epoch": 9.609739826551033, + "loss": 0.3274, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "grad_norm": 1.2012810707092285, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "learning_rate": 3.1910556126212963e-06, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "loss": 0.3235844373703003, + "step": 28810 + }, + { + "ce_loss": 0.06070598587393761, + "epoch": 9.609739826551033, + "step": 28810 + }, + { + "distill_loss": 0.1533758044242859, + "epoch": 9.609739826551033, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "ref_ce_loss": 0.061860065907239914, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "loss": 0.30414530634880066, + "step": 28810 + }, + { + "ce_loss": 0.03909186273813248, + "epoch": 9.609739826551033, + "step": 28810 + }, + { + "distill_loss": 0.1880081593990326, + "epoch": 9.609739826551033, + "step": 28810 + }, + { + "epoch": 9.609739826551033, + "ref_ce_loss": 0.046735141426324844, + "step": 28810 + }, + { + "epoch": 9.61307538358906, + "loss": 0.3513, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "grad_norm": 1.2907757759094238, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "learning_rate": 3.1368118667010505e-06, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "loss": 0.34699732065200806, + "step": 28820 + }, + { + "ce_loss": 0.020871445536613464, + "epoch": 9.61307538358906, + "step": 28820 + }, + { + "distill_loss": 0.1841685175895691, + "epoch": 9.61307538358906, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "ref_ce_loss": 0.06769346445798874, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "loss": 0.4415394067764282, + "step": 28820 + }, + { + "ce_loss": 0.028012122958898544, + "epoch": 9.61307538358906, + "step": 28820 + }, + { + "distill_loss": 0.16002513468265533, + "epoch": 9.61307538358906, + "step": 28820 + }, + { + "epoch": 9.61307538358906, + "ref_ce_loss": 0.05012548342347145, + "step": 28820 + }, + { + "epoch": 9.616410940627084, + "loss": 0.3138, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "grad_norm": 1.590085506439209, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "learning_rate": 3.083031302825612e-06, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "loss": 0.3491607904434204, + "step": 28830 + }, + { + "ce_loss": 0.037974365055561066, + "epoch": 9.616410940627084, + "step": 28830 + }, + { + "distill_loss": 0.17866151034832, + "epoch": 9.616410940627084, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "ref_ce_loss": 0.05684041231870651, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "loss": 0.36506855487823486, + "step": 28830 + }, + { + "ce_loss": 0.05224547162652016, + "epoch": 9.616410940627084, + "step": 28830 + }, + { + "distill_loss": 0.21156731247901917, + "epoch": 9.616410940627084, + "step": 28830 + }, + { + "epoch": 9.616410940627084, + "ref_ce_loss": 0.07331357151269913, + "step": 28830 + }, + { + "epoch": 9.61974649766511, + "loss": 0.3419, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "grad_norm": 1.2835090160369873, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "learning_rate": 3.0297139837627273e-06, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "loss": 0.46609291434288025, + "step": 28840 + }, + { + "ce_loss": 0.0682363361120224, + "epoch": 9.61974649766511, + "step": 28840 + }, + { + "distill_loss": 0.17836935818195343, + "epoch": 9.61974649766511, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "ref_ce_loss": 0.07604022324085236, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "loss": 0.2486204355955124, + "step": 28840 + }, + { + "ce_loss": 0.028242826461791992, + "epoch": 9.61974649766511, + "step": 28840 + }, + { + "distill_loss": 0.14386576414108276, + "epoch": 9.61974649766511, + "step": 28840 + }, + { + "epoch": 9.61974649766511, + "ref_ce_loss": 0.05152817815542221, + "step": 28840 + }, + { + "epoch": 9.623082054703135, + "loss": 0.3885, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "grad_norm": 1.1511974334716797, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "learning_rate": 2.9768599717393763e-06, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "loss": 0.23758172988891602, + "step": 28850 + }, + { + "ce_loss": 0.02366476133465767, + "epoch": 9.623082054703135, + "step": 28850 + }, + { + "distill_loss": 0.15192754566669464, + "epoch": 9.623082054703135, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "ref_ce_loss": 0.043029189109802246, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "loss": 0.29049375653266907, + "step": 28850 + }, + { + "ce_loss": 0.04398266226053238, + "epoch": 9.623082054703135, + "step": 28850 + }, + { + "distill_loss": 0.17389945685863495, + "epoch": 9.623082054703135, + "step": 28850 + }, + { + "epoch": 9.623082054703135, + "ref_ce_loss": 0.05363408103585243, + "step": 28850 + }, + { + "epoch": 9.62641761174116, + "loss": 0.336, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "grad_norm": 1.0026607513427734, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "learning_rate": 2.924469328441859e-06, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "loss": 0.4305479824542999, + "step": 28860 + }, + { + "ce_loss": 0.06890071928501129, + "epoch": 9.62641761174116, + "step": 28860 + }, + { + "distill_loss": 0.21621493995189667, + "epoch": 9.62641761174116, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "ref_ce_loss": 0.06498651951551437, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "loss": 0.253123939037323, + "step": 28860 + }, + { + "ce_loss": 0.03857715055346489, + "epoch": 9.62641761174116, + "step": 28860 + }, + { + "distill_loss": 0.1470891535282135, + "epoch": 9.62641761174116, + "step": 28860 + }, + { + "epoch": 9.62641761174116, + "ref_ce_loss": 0.053641896694898605, + "step": 28860 + }, + { + "epoch": 9.629753168779185, + "loss": 0.3424, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "grad_norm": 0.844653308391571, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "learning_rate": 2.8725421150157527e-06, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "loss": 0.3580155074596405, + "step": 28870 + }, + { + "ce_loss": 0.07331318408250809, + "epoch": 9.629753168779185, + "step": 28870 + }, + { + "distill_loss": 0.21698784828186035, + "epoch": 9.629753168779185, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "ref_ce_loss": 0.053149282932281494, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "loss": 0.3565460443496704, + "step": 28870 + }, + { + "ce_loss": 0.05684034526348114, + "epoch": 9.629753168779185, + "step": 28870 + }, + { + "distill_loss": 0.21384520828723907, + "epoch": 9.629753168779185, + "step": 28870 + }, + { + "epoch": 9.629753168779185, + "ref_ce_loss": 0.06192195042967796, + "step": 28870 + }, + { + "epoch": 9.633088725817212, + "loss": 0.3564, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "grad_norm": 1.2454036474227905, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "learning_rate": 2.8210783920656903e-06, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "loss": 0.5010125041007996, + "step": 28880 + }, + { + "ce_loss": 0.03849237039685249, + "epoch": 9.633088725817212, + "step": 28880 + }, + { + "distill_loss": 0.17564459145069122, + "epoch": 9.633088725817212, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "ref_ce_loss": 0.07166219502687454, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "loss": 0.338174968957901, + "step": 28880 + }, + { + "ce_loss": 0.0471452958881855, + "epoch": 9.633088725817212, + "step": 28880 + }, + { + "distill_loss": 0.16874736547470093, + "epoch": 9.633088725817212, + "step": 28880 + }, + { + "epoch": 9.633088725817212, + "ref_ce_loss": 0.09181161969900131, + "step": 28880 + }, + { + "epoch": 9.636424282855236, + "loss": 0.3748, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "grad_norm": 2.602931261062622, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "learning_rate": 2.7700782196553585e-06, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "loss": 0.36306819319725037, + "step": 28890 + }, + { + "ce_loss": 0.04997847229242325, + "epoch": 9.636424282855236, + "step": 28890 + }, + { + "distill_loss": 0.22075606882572174, + "epoch": 9.636424282855236, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "ref_ce_loss": 0.07122057676315308, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "loss": 0.4285609722137451, + "step": 28890 + }, + { + "ce_loss": 0.0449661985039711, + "epoch": 9.636424282855236, + "step": 28890 + }, + { + "distill_loss": 0.18159648776054382, + "epoch": 9.636424282855236, + "step": 28890 + }, + { + "epoch": 9.636424282855236, + "ref_ce_loss": 0.05567074567079544, + "step": 28890 + }, + { + "epoch": 9.639759839893262, + "loss": 0.3606, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "grad_norm": 1.0745766162872314, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "learning_rate": 2.7195416573074562e-06, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "loss": 0.28361329436302185, + "step": 28900 + }, + { + "ce_loss": 0.03859997168183327, + "epoch": 9.639759839893262, + "step": 28900 + }, + { + "distill_loss": 0.1544235497713089, + "epoch": 9.639759839893262, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "ref_ce_loss": 0.07481688261032104, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "loss": 0.5972962379455566, + "step": 28900 + }, + { + "ce_loss": 0.06487046927213669, + "epoch": 9.639759839893262, + "step": 28900 + }, + { + "distill_loss": 0.14648039638996124, + "epoch": 9.639759839893262, + "step": 28900 + }, + { + "epoch": 9.639759839893262, + "ref_ce_loss": 0.097092404961586, + "step": 28900 + }, + { + "epoch": 9.643095396931287, + "loss": 0.333, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "grad_norm": 0.9275277256965637, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "learning_rate": 2.6694687640036022e-06, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "loss": 0.3020519018173218, + "step": 28910 + }, + { + "ce_loss": 0.038193780928850174, + "epoch": 9.643095396931287, + "step": 28910 + }, + { + "distill_loss": 0.19235727190971375, + "epoch": 9.643095396931287, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "ref_ce_loss": 0.05098062381148338, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "loss": 0.2890506088733673, + "step": 28910 + }, + { + "ce_loss": 0.05658218637108803, + "epoch": 9.643095396931287, + "step": 28910 + }, + { + "distill_loss": 0.18204079568386078, + "epoch": 9.643095396931287, + "step": 28910 + }, + { + "epoch": 9.643095396931287, + "ref_ce_loss": 0.05018565058708191, + "step": 28910 + }, + { + "epoch": 9.646430953969313, + "loss": 0.3726, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "grad_norm": 1.0181881189346313, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "learning_rate": 2.6198595981842046e-06, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "loss": 0.3459164798259735, + "step": 28920 + }, + { + "ce_loss": 0.0480022206902504, + "epoch": 9.646430953969313, + "step": 28920 + }, + { + "distill_loss": 0.164689838886261, + "epoch": 9.646430953969313, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "ref_ce_loss": 0.046335261315107346, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "loss": 0.2523891031742096, + "step": 28920 + }, + { + "ce_loss": 0.028045305982232094, + "epoch": 9.646430953969313, + "step": 28920 + }, + { + "distill_loss": 0.16294798254966736, + "epoch": 9.646430953969313, + "step": 28920 + }, + { + "epoch": 9.646430953969313, + "ref_ce_loss": 0.06082697585225105, + "step": 28920 + }, + { + "epoch": 9.649766511007337, + "loss": 0.3963, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "grad_norm": 1.255349040031433, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "learning_rate": 2.570714217748549e-06, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "loss": 0.42087769508361816, + "step": 28930 + }, + { + "ce_loss": 0.0513894259929657, + "epoch": 9.649766511007337, + "step": 28930 + }, + { + "distill_loss": 0.17212828993797302, + "epoch": 9.649766511007337, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "ref_ce_loss": 0.0597614161670208, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "loss": 0.27796125411987305, + "step": 28930 + }, + { + "ce_loss": 0.04492616653442383, + "epoch": 9.649766511007337, + "step": 28930 + }, + { + "distill_loss": 0.17121589183807373, + "epoch": 9.649766511007337, + "step": 28930 + }, + { + "epoch": 9.649766511007337, + "ref_ce_loss": 0.04602546989917755, + "step": 28930 + }, + { + "epoch": 9.653102068045364, + "loss": 0.3608, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "grad_norm": 0.8002476096153259, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "learning_rate": 2.5220326800545313e-06, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "loss": 0.3676643967628479, + "step": 28940 + }, + { + "ce_loss": 0.04760293290019035, + "epoch": 9.653102068045364, + "step": 28940 + }, + { + "distill_loss": 0.20377562940120697, + "epoch": 9.653102068045364, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "ref_ce_loss": 0.0758914053440094, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "loss": 0.30955246090888977, + "step": 28940 + }, + { + "ce_loss": 0.035490572452545166, + "epoch": 9.653102068045364, + "step": 28940 + }, + { + "distill_loss": 0.18446147441864014, + "epoch": 9.653102068045364, + "step": 28940 + }, + { + "epoch": 9.653102068045364, + "ref_ce_loss": 0.06045921519398689, + "step": 28940 + }, + { + "epoch": 9.656437625083388, + "loss": 0.3389, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "grad_norm": 1.1092398166656494, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "learning_rate": 2.473815041918792e-06, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "loss": 0.23038047552108765, + "step": 28950 + }, + { + "ce_loss": 0.024803156033158302, + "epoch": 9.656437625083388, + "step": 28950 + }, + { + "distill_loss": 0.13957466185092926, + "epoch": 9.656437625083388, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "ref_ce_loss": 0.04500986263155937, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "loss": 0.31989532709121704, + "step": 28950 + }, + { + "ce_loss": 0.042344290763139725, + "epoch": 9.656437625083388, + "step": 28950 + }, + { + "distill_loss": 0.20331068336963654, + "epoch": 9.656437625083388, + "step": 28950 + }, + { + "epoch": 9.656437625083388, + "ref_ce_loss": 0.05306573957204819, + "step": 28950 + }, + { + "epoch": 9.659773182121414, + "loss": 0.3883, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "grad_norm": 0.9196416139602661, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "learning_rate": 2.426061359616494e-06, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "loss": 0.276298850774765, + "step": 28960 + }, + { + "ce_loss": 0.018789373338222504, + "epoch": 9.659773182121414, + "step": 28960 + }, + { + "distill_loss": 0.14118222892284393, + "epoch": 9.659773182121414, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "ref_ce_loss": 0.06556124240159988, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "loss": 0.9002341032028198, + "step": 28960 + }, + { + "ce_loss": 0.06075263395905495, + "epoch": 9.659773182121414, + "step": 28960 + }, + { + "distill_loss": 0.17979496717453003, + "epoch": 9.659773182121414, + "step": 28960 + }, + { + "epoch": 9.659773182121414, + "ref_ce_loss": 0.05195245519280434, + "step": 28960 + }, + { + "epoch": 9.663108739159439, + "loss": 0.3365, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "grad_norm": 1.0752198696136475, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "learning_rate": 2.3787716888813206e-06, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "loss": 0.43198779225349426, + "step": 28970 + }, + { + "ce_loss": 0.05001339688897133, + "epoch": 9.663108739159439, + "step": 28970 + }, + { + "distill_loss": 0.17285940051078796, + "epoch": 9.663108739159439, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "ref_ce_loss": 0.06730245053768158, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "loss": 0.41112181544303894, + "step": 28970 + }, + { + "ce_loss": 0.055711500346660614, + "epoch": 9.663108739159439, + "step": 28970 + }, + { + "distill_loss": 0.17232096195220947, + "epoch": 9.663108739159439, + "step": 28970 + }, + { + "epoch": 9.663108739159439, + "ref_ce_loss": 0.07165464013814926, + "step": 28970 + }, + { + "epoch": 9.666444296197465, + "loss": 0.3493, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "grad_norm": 1.0368164777755737, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "learning_rate": 2.3319460849053897e-06, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "loss": 0.29087287187576294, + "step": 28980 + }, + { + "ce_loss": 0.028599737212061882, + "epoch": 9.666444296197465, + "step": 28980 + }, + { + "distill_loss": 0.19898228347301483, + "epoch": 9.666444296197465, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "ref_ce_loss": 0.06319849193096161, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "loss": 0.23333315551280975, + "step": 28980 + }, + { + "ce_loss": 0.019216326996684074, + "epoch": 9.666444296197465, + "step": 28980 + }, + { + "distill_loss": 0.13311071693897247, + "epoch": 9.666444296197465, + "step": 28980 + }, + { + "epoch": 9.666444296197465, + "ref_ce_loss": 0.06234074756503105, + "step": 28980 + }, + { + "epoch": 9.66977985323549, + "loss": 0.3176, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "grad_norm": 0.9731513857841492, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "learning_rate": 2.2855846023392524e-06, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "loss": 0.39693683385849, + "step": 28990 + }, + { + "ce_loss": 0.022053180262446404, + "epoch": 9.66977985323549, + "step": 28990 + }, + { + "distill_loss": 0.17590022087097168, + "epoch": 9.66977985323549, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "ref_ce_loss": 0.05321211740374565, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "loss": 0.340067982673645, + "step": 28990 + }, + { + "ce_loss": 0.06889030337333679, + "epoch": 9.66977985323549, + "step": 28990 + }, + { + "distill_loss": 0.19582036137580872, + "epoch": 9.66977985323549, + "step": 28990 + }, + { + "epoch": 9.66977985323549, + "ref_ce_loss": 0.07497034221887589, + "step": 28990 + }, + { + "epoch": 9.673115410273516, + "loss": 0.3492, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "grad_norm": 1.4342679977416992, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "learning_rate": 2.239687295291715e-06, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "loss": 0.2772221267223358, + "step": 29000 + }, + { + "ce_loss": 0.017711373046040535, + "epoch": 9.673115410273516, + "step": 29000 + }, + { + "distill_loss": 0.1418878436088562, + "epoch": 9.673115410273516, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "ref_ce_loss": 0.05305256322026253, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "loss": 0.44195717573165894, + "step": 29000 + }, + { + "ce_loss": 0.05625804141163826, + "epoch": 9.673115410273516, + "step": 29000 + }, + { + "distill_loss": 0.17068497836589813, + "epoch": 9.673115410273516, + "step": 29000 + }, + { + "epoch": 9.673115410273516, + "ref_ce_loss": 0.06807989627122879, + "step": 29000 + }, + { + "epoch": 9.67645096731154, + "loss": 0.3439, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "grad_norm": 1.06513249874115, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "learning_rate": 2.194254217329883e-06, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "loss": 0.323006808757782, + "step": 29010 + }, + { + "ce_loss": 0.028976066038012505, + "epoch": 9.67645096731154, + "step": 29010 + }, + { + "distill_loss": 0.19334059953689575, + "epoch": 9.67645096731154, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "ref_ce_loss": 0.06735754758119583, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "loss": 0.257314532995224, + "step": 29010 + }, + { + "ce_loss": 0.02535068616271019, + "epoch": 9.67645096731154, + "step": 29010 + }, + { + "distill_loss": 0.1416252702474594, + "epoch": 9.67645096731154, + "step": 29010 + }, + { + "epoch": 9.67645096731154, + "ref_ce_loss": 0.06281197816133499, + "step": 29010 + }, + { + "epoch": 9.679786524349566, + "loss": 0.3346, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "grad_norm": 1.426174521446228, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "learning_rate": 2.1492854214790304e-06, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "loss": 0.28953003883361816, + "step": 29020 + }, + { + "ce_loss": 0.044721540063619614, + "epoch": 9.679786524349566, + "step": 29020 + }, + { + "distill_loss": 0.18095023930072784, + "epoch": 9.679786524349566, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "ref_ce_loss": 0.04519897699356079, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "loss": 0.4611284136772156, + "step": 29020 + }, + { + "ce_loss": 0.07004204392433167, + "epoch": 9.679786524349566, + "step": 29020 + }, + { + "distill_loss": 0.19099056720733643, + "epoch": 9.679786524349566, + "step": 29020 + }, + { + "epoch": 9.679786524349566, + "ref_ce_loss": 0.07121763378381729, + "step": 29020 + }, + { + "epoch": 9.683122081387591, + "loss": 0.3151, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "grad_norm": 0.7787420153617859, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "learning_rate": 2.1047809602225966e-06, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "loss": 0.283674031496048, + "step": 29030 + }, + { + "ce_loss": 0.03510609641671181, + "epoch": 9.683122081387591, + "step": 29030 + }, + { + "distill_loss": 0.164617657661438, + "epoch": 9.683122081387591, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "ref_ce_loss": 0.05867642164230347, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "loss": 0.39628174901008606, + "step": 29030 + }, + { + "ce_loss": 0.07344058156013489, + "epoch": 9.683122081387591, + "step": 29030 + }, + { + "distill_loss": 0.1842685043811798, + "epoch": 9.683122081387591, + "step": 29030 + }, + { + "epoch": 9.683122081387591, + "ref_ce_loss": 0.076644167304039, + "step": 29030 + }, + { + "epoch": 9.686457638425617, + "loss": 0.3965, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "grad_norm": 1.1861432790756226, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "learning_rate": 2.0607408855020995e-06, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "loss": 0.2955714166164398, + "step": 29040 + }, + { + "ce_loss": 0.03226979076862335, + "epoch": 9.686457638425617, + "step": 29040 + }, + { + "distill_loss": 0.1686120629310608, + "epoch": 9.686457638425617, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "ref_ce_loss": 0.07210496068000793, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "loss": 0.3625262975692749, + "step": 29040 + }, + { + "ce_loss": 0.05599134787917137, + "epoch": 9.686457638425617, + "step": 29040 + }, + { + "distill_loss": 0.20067884027957916, + "epoch": 9.686457638425617, + "step": 29040 + }, + { + "epoch": 9.686457638425617, + "ref_ce_loss": 0.046158257871866226, + "step": 29040 + }, + { + "epoch": 9.689793195463643, + "loss": 0.3506, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "grad_norm": 0.8858498930931091, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "learning_rate": 2.017165248717001e-06, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "loss": 0.2790871560573578, + "step": 29050 + }, + { + "ce_loss": 0.03252222016453743, + "epoch": 9.689793195463643, + "step": 29050 + }, + { + "distill_loss": 0.1491430401802063, + "epoch": 9.689793195463643, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "ref_ce_loss": 0.07085377722978592, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "loss": 0.2061745971441269, + "step": 29050 + }, + { + "ce_loss": 0.037018779665231705, + "epoch": 9.689793195463643, + "step": 29050 + }, + { + "distill_loss": 0.12827187776565552, + "epoch": 9.689793195463643, + "step": 29050 + }, + { + "epoch": 9.689793195463643, + "ref_ce_loss": 0.040482133626937866, + "step": 29050 + }, + { + "epoch": 9.693128752501668, + "loss": 0.3418, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "grad_norm": 1.0434974431991577, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "learning_rate": 1.9740541007247983e-06, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "loss": 0.32193756103515625, + "step": 29060 + }, + { + "ce_loss": 0.029255038127303123, + "epoch": 9.693128752501668, + "step": 29060 + }, + { + "distill_loss": 0.15391921997070312, + "epoch": 9.693128752501668, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "ref_ce_loss": 0.08128950744867325, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "loss": 0.23065324127674103, + "step": 29060 + }, + { + "ce_loss": 0.01794644072651863, + "epoch": 9.693128752501668, + "step": 29060 + }, + { + "distill_loss": 0.13754110038280487, + "epoch": 9.693128752501668, + "step": 29060 + }, + { + "epoch": 9.693128752501668, + "ref_ce_loss": 0.04723289608955383, + "step": 29060 + }, + { + "epoch": 9.696464309539692, + "loss": 0.3458, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "grad_norm": 1.0453249216079712, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "learning_rate": 1.931407491840842e-06, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "loss": 0.3305038511753082, + "step": 29070 + }, + { + "ce_loss": 0.0752817839384079, + "epoch": 9.696464309539692, + "step": 29070 + }, + { + "distill_loss": 0.16382677853107452, + "epoch": 9.696464309539692, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "ref_ce_loss": 0.07009885460138321, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "loss": 0.3689107894897461, + "step": 29070 + }, + { + "ce_loss": 0.04890412092208862, + "epoch": 9.696464309539692, + "step": 29070 + }, + { + "distill_loss": 0.17433670163154602, + "epoch": 9.696464309539692, + "step": 29070 + }, + { + "epoch": 9.696464309539692, + "ref_ce_loss": 0.08437875658273697, + "step": 29070 + }, + { + "epoch": 9.699799866577719, + "loss": 0.3394, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "grad_norm": 1.406949758529663, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "learning_rate": 1.8892254718382962e-06, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "loss": 0.4878078103065491, + "step": 29080 + }, + { + "ce_loss": 0.050842683762311935, + "epoch": 9.699799866577719, + "step": 29080 + }, + { + "distill_loss": 0.24324414134025574, + "epoch": 9.699799866577719, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "ref_ce_loss": 0.07579061388969421, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "loss": 0.3403063714504242, + "step": 29080 + }, + { + "ce_loss": 0.042193423956632614, + "epoch": 9.699799866577719, + "step": 29080 + }, + { + "distill_loss": 0.1843198984861374, + "epoch": 9.699799866577719, + "step": 29080 + }, + { + "epoch": 9.699799866577719, + "ref_ce_loss": 0.08377090841531754, + "step": 29080 + }, + { + "epoch": 9.703135423615745, + "loss": 0.3533, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "grad_norm": 0.9102470874786377, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "learning_rate": 1.8475080899480913e-06, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "loss": 0.7278520464897156, + "step": 29090 + }, + { + "ce_loss": 0.09050574898719788, + "epoch": 9.703135423615745, + "step": 29090 + }, + { + "distill_loss": 0.20973047614097595, + "epoch": 9.703135423615745, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "ref_ce_loss": 0.07229988276958466, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "loss": 0.36448508501052856, + "step": 29090 + }, + { + "ce_loss": 0.05310250073671341, + "epoch": 9.703135423615745, + "step": 29090 + }, + { + "distill_loss": 0.17795789241790771, + "epoch": 9.703135423615745, + "step": 29090 + }, + { + "epoch": 9.703135423615745, + "ref_ce_loss": 0.07669250667095184, + "step": 29090 + }, + { + "epoch": 9.70647098065377, + "loss": 0.4006, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "grad_norm": 1.8485174179077148, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "learning_rate": 1.8062553948589244e-06, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "loss": 0.33109721541404724, + "step": 29100 + }, + { + "ce_loss": 0.03415803238749504, + "epoch": 9.70647098065377, + "step": 29100 + }, + { + "distill_loss": 0.21031081676483154, + "epoch": 9.70647098065377, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "ref_ce_loss": 0.062195755541324615, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "loss": 0.2148629128932953, + "step": 29100 + }, + { + "ce_loss": 0.03016166016459465, + "epoch": 9.70647098065377, + "step": 29100 + }, + { + "distill_loss": 0.14262332022190094, + "epoch": 9.70647098065377, + "step": 29100 + }, + { + "epoch": 9.70647098065377, + "ref_ce_loss": 0.041959211230278015, + "step": 29100 + }, + { + "epoch": 9.709806537691794, + "loss": 0.3232, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "grad_norm": 1.0245168209075928, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "learning_rate": 1.7654674347171719e-06, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "loss": 0.3630022406578064, + "step": 29110 + }, + { + "ce_loss": 0.03858557716012001, + "epoch": 9.709806537691794, + "step": 29110 + }, + { + "distill_loss": 0.22540675103664398, + "epoch": 9.709806537691794, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "ref_ce_loss": 0.07783447206020355, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "loss": 0.28904521465301514, + "step": 29110 + }, + { + "ce_loss": 0.022058136761188507, + "epoch": 9.709806537691794, + "step": 29110 + }, + { + "distill_loss": 0.1771065890789032, + "epoch": 9.709806537691794, + "step": 29110 + }, + { + "epoch": 9.709806537691794, + "ref_ce_loss": 0.06527817249298096, + "step": 29110 + }, + { + "epoch": 9.71314209472982, + "loss": 0.3433, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "grad_norm": 0.9284103512763977, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "learning_rate": 1.7251442571267095e-06, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "loss": 0.37761133909225464, + "step": 29120 + }, + { + "ce_loss": 0.01938902959227562, + "epoch": 9.71314209472982, + "step": 29120 + }, + { + "distill_loss": 0.20532874763011932, + "epoch": 9.71314209472982, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "ref_ce_loss": 0.06604550033807755, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "loss": 0.37872540950775146, + "step": 29120 + }, + { + "ce_loss": 0.042316507548093796, + "epoch": 9.71314209472982, + "step": 29120 + }, + { + "distill_loss": 0.19641393423080444, + "epoch": 9.71314209472982, + "step": 29120 + }, + { + "epoch": 9.71314209472982, + "ref_ce_loss": 0.06613562256097794, + "step": 29120 + }, + { + "epoch": 9.716477651767846, + "loss": 0.3223, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "grad_norm": 0.7625808715820312, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "learning_rate": 1.685285909149048e-06, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "loss": 0.36743196845054626, + "step": 29130 + }, + { + "ce_loss": 0.03555121272802353, + "epoch": 9.716477651767846, + "step": 29130 + }, + { + "distill_loss": 0.15161210298538208, + "epoch": 9.716477651767846, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "ref_ce_loss": 0.07639976590871811, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "loss": 0.4357321262359619, + "step": 29130 + }, + { + "ce_loss": 0.04716123268008232, + "epoch": 9.716477651767846, + "step": 29130 + }, + { + "distill_loss": 0.2087934911251068, + "epoch": 9.716477651767846, + "step": 29130 + }, + { + "epoch": 9.716477651767846, + "ref_ce_loss": 0.05667942389845848, + "step": 29130 + }, + { + "epoch": 9.71981320880587, + "loss": 0.3243, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "grad_norm": 0.7821916937828064, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "learning_rate": 1.6458924373031537e-06, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "loss": 0.35661622881889343, + "step": 29140 + }, + { + "ce_loss": 0.03580138087272644, + "epoch": 9.71981320880587, + "step": 29140 + }, + { + "distill_loss": 0.193109393119812, + "epoch": 9.71981320880587, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "ref_ce_loss": 0.0483783595263958, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "loss": 0.32538557052612305, + "step": 29140 + }, + { + "ce_loss": 0.05694977939128876, + "epoch": 9.71981320880587, + "step": 29140 + }, + { + "distill_loss": 0.17848563194274902, + "epoch": 9.71981320880587, + "step": 29140 + }, + { + "epoch": 9.71981320880587, + "ref_ce_loss": 0.06146978586912155, + "step": 29140 + }, + { + "epoch": 9.723148765843895, + "loss": 0.3345, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "grad_norm": 0.824491560459137, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "learning_rate": 1.6069638875654491e-06, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "loss": 0.6099705100059509, + "step": 29150 + }, + { + "ce_loss": 0.058300528675317764, + "epoch": 9.723148765843895, + "step": 29150 + }, + { + "distill_loss": 0.16668078303337097, + "epoch": 9.723148765843895, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "ref_ce_loss": 0.08649856597185135, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "loss": 0.34289857745170593, + "step": 29150 + }, + { + "ce_loss": 0.042254604399204254, + "epoch": 9.723148765843895, + "step": 29150 + }, + { + "distill_loss": 0.16260452568531036, + "epoch": 9.723148765843895, + "step": 29150 + }, + { + "epoch": 9.723148765843895, + "ref_ce_loss": 0.05944216996431351, + "step": 29150 + }, + { + "epoch": 9.726484322881921, + "loss": 0.3579, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "grad_norm": 1.2022480964660645, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "learning_rate": 1.5685003053698134e-06, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "loss": 0.35259881615638733, + "step": 29160 + }, + { + "ce_loss": 0.044145677238702774, + "epoch": 9.726484322881921, + "step": 29160 + }, + { + "distill_loss": 0.16626057028770447, + "epoch": 9.726484322881921, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "ref_ce_loss": 0.05313951522111893, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "loss": 0.3314366340637207, + "step": 29160 + }, + { + "ce_loss": 0.04013233259320259, + "epoch": 9.726484322881921, + "step": 29160 + }, + { + "distill_loss": 0.19226589798927307, + "epoch": 9.726484322881921, + "step": 29160 + }, + { + "epoch": 9.726484322881921, + "ref_ce_loss": 0.06861289590597153, + "step": 29160 + }, + { + "epoch": 9.729819879919948, + "loss": 0.3212, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "grad_norm": 1.4517748355865479, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "learning_rate": 1.5305017356072704e-06, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "loss": 0.2889649569988251, + "step": 29170 + }, + { + "ce_loss": 0.054967913776636124, + "epoch": 9.729819879919948, + "step": 29170 + }, + { + "distill_loss": 0.17594516277313232, + "epoch": 9.729819879919948, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "ref_ce_loss": 0.05794268473982811, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "loss": 0.23234641551971436, + "step": 29170 + }, + { + "ce_loss": 0.04126816242933273, + "epoch": 9.729819879919948, + "step": 29170 + }, + { + "distill_loss": 0.13288410007953644, + "epoch": 9.729819879919948, + "step": 29170 + }, + { + "epoch": 9.729819879919948, + "ref_ce_loss": 0.04256091266870499, + "step": 29170 + }, + { + "epoch": 9.733155436957972, + "loss": 0.3142, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "grad_norm": 0.9493514895439148, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "learning_rate": 1.4929682226263009e-06, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "loss": 0.25031107664108276, + "step": 29180 + }, + { + "ce_loss": 0.016915347427129745, + "epoch": 9.733155436957972, + "step": 29180 + }, + { + "distill_loss": 0.16258159279823303, + "epoch": 9.733155436957972, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "ref_ce_loss": 0.042231179773807526, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "loss": 0.2727430760860443, + "step": 29180 + }, + { + "ce_loss": 0.03402730077505112, + "epoch": 9.733155436957972, + "step": 29180 + }, + { + "distill_loss": 0.17123806476593018, + "epoch": 9.733155436957972, + "step": 29180 + }, + { + "epoch": 9.733155436957972, + "ref_ce_loss": 0.06728982925415039, + "step": 29180 + }, + { + "epoch": 9.736490993995996, + "loss": 0.3432, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "grad_norm": 1.1605498790740967, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "learning_rate": 1.455899810232575e-06, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "loss": 0.29889416694641113, + "step": 29190 + }, + { + "ce_loss": 0.032608695328235626, + "epoch": 9.736490993995996, + "step": 29190 + }, + { + "distill_loss": 0.21592484414577484, + "epoch": 9.736490993995996, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "ref_ce_loss": 0.0500856414437294, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "loss": 0.3522995412349701, + "step": 29190 + }, + { + "ce_loss": 0.044759076088666916, + "epoch": 9.736490993995996, + "step": 29190 + }, + { + "distill_loss": 0.17451868951320648, + "epoch": 9.736490993995996, + "step": 29190 + }, + { + "epoch": 9.736490993995996, + "ref_ce_loss": 0.055287107825279236, + "step": 29190 + }, + { + "epoch": 9.739826551034023, + "loss": 0.3752, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "grad_norm": 1.1452367305755615, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "learning_rate": 1.4192965416888637e-06, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "loss": 0.24558773636817932, + "step": 29200 + }, + { + "ce_loss": 0.033938828855752945, + "epoch": 9.739826551034023, + "step": 29200 + }, + { + "distill_loss": 0.1440381407737732, + "epoch": 9.739826551034023, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "ref_ce_loss": 0.04648728296160698, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "loss": 0.37278178334236145, + "step": 29200 + }, + { + "ce_loss": 0.052839286625385284, + "epoch": 9.739826551034023, + "step": 29200 + }, + { + "distill_loss": 0.150975301861763, + "epoch": 9.739826551034023, + "step": 29200 + }, + { + "epoch": 9.739826551034023, + "ref_ce_loss": 0.07346734404563904, + "step": 29200 + }, + { + "epoch": 9.743162108072049, + "loss": 0.309, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "grad_norm": 1.1472444534301758, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "learning_rate": 1.3831584597151282e-06, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "loss": 0.2397063970565796, + "step": 29210 + }, + { + "ce_loss": 0.03823701664805412, + "epoch": 9.743162108072049, + "step": 29210 + }, + { + "distill_loss": 0.16035953164100647, + "epoch": 9.743162108072049, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "ref_ce_loss": 0.040390852838754654, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "loss": 0.2995006740093231, + "step": 29210 + }, + { + "ce_loss": 0.04192734509706497, + "epoch": 9.743162108072049, + "step": 29210 + }, + { + "distill_loss": 0.17175507545471191, + "epoch": 9.743162108072049, + "step": 29210 + }, + { + "epoch": 9.743162108072049, + "ref_ce_loss": 0.06877624988555908, + "step": 29210 + }, + { + "epoch": 9.746497665110073, + "loss": 0.3703, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "grad_norm": 1.6166616678237915, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "learning_rate": 1.3474856064884745e-06, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "loss": 0.309572696685791, + "step": 29220 + }, + { + "ce_loss": 0.028945567086338997, + "epoch": 9.746497665110073, + "step": 29220 + }, + { + "distill_loss": 0.1187836080789566, + "epoch": 9.746497665110073, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "ref_ce_loss": 0.04301934316754341, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "loss": 0.2944486439228058, + "step": 29220 + }, + { + "ce_loss": 0.036328088492155075, + "epoch": 9.746497665110073, + "step": 29220 + }, + { + "distill_loss": 0.19853277504444122, + "epoch": 9.746497665110073, + "step": 29220 + }, + { + "epoch": 9.746497665110073, + "ref_ce_loss": 0.059367354959249496, + "step": 29220 + }, + { + "epoch": 9.749833222148098, + "loss": 0.3794, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "grad_norm": 1.2834672927856445, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "learning_rate": 1.3122780236428433e-06, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "loss": 0.2742766737937927, + "step": 29230 + }, + { + "ce_loss": 0.030356809496879578, + "epoch": 9.749833222148098, + "step": 29230 + }, + { + "distill_loss": 0.15357893705368042, + "epoch": 9.749833222148098, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "ref_ce_loss": 0.06088166683912277, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "loss": 0.3923237919807434, + "step": 29230 + }, + { + "ce_loss": 0.029607567936182022, + "epoch": 9.749833222148098, + "step": 29230 + }, + { + "distill_loss": 0.19450455904006958, + "epoch": 9.749833222148098, + "step": 29230 + }, + { + "epoch": 9.749833222148098, + "ref_ce_loss": 0.06190716102719307, + "step": 29230 + }, + { + "epoch": 9.753168779186124, + "loss": 0.3846, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "grad_norm": 2.3320555686950684, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "learning_rate": 1.2775357522693653e-06, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "loss": 0.34208378195762634, + "step": 29240 + }, + { + "ce_loss": 0.051376692950725555, + "epoch": 9.753168779186124, + "step": 29240 + }, + { + "distill_loss": 0.2006101906299591, + "epoch": 9.753168779186124, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "ref_ce_loss": 0.06693777441978455, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "loss": 0.277177095413208, + "step": 29240 + }, + { + "ce_loss": 0.034729231148958206, + "epoch": 9.753168779186124, + "step": 29240 + }, + { + "distill_loss": 0.14436738193035126, + "epoch": 9.753168779186124, + "step": 29240 + }, + { + "epoch": 9.753168779186124, + "ref_ce_loss": 0.058638881891965866, + "step": 29240 + }, + { + "epoch": 9.75650433622415, + "loss": 0.3215, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "grad_norm": 0.9842942357063293, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "learning_rate": 1.243258832915961e-06, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "loss": 0.22968745231628418, + "step": 29250 + }, + { + "ce_loss": 0.024926142767071724, + "epoch": 9.75650433622415, + "step": 29250 + }, + { + "distill_loss": 0.1374337375164032, + "epoch": 9.75650433622415, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "ref_ce_loss": 0.04320191964507103, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "loss": 0.4632025361061096, + "step": 29250 + }, + { + "ce_loss": 0.08126223832368851, + "epoch": 9.75650433622415, + "step": 29250 + }, + { + "distill_loss": 0.21030139923095703, + "epoch": 9.75650433622415, + "step": 29250 + }, + { + "epoch": 9.75650433622415, + "ref_ce_loss": 0.0613284595310688, + "step": 29250 + }, + { + "epoch": 9.759839893262175, + "loss": 0.3143, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "grad_norm": 0.7296186685562134, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "learning_rate": 1.2094473055875188e-06, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "loss": 0.2869877517223358, + "step": 29260 + }, + { + "ce_loss": 0.05808643996715546, + "epoch": 9.759839893262175, + "step": 29260 + }, + { + "distill_loss": 0.164188951253891, + "epoch": 9.759839893262175, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "ref_ce_loss": 0.06443438678979874, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "loss": 0.41061264276504517, + "step": 29260 + }, + { + "ce_loss": 0.039066676050424576, + "epoch": 9.759839893262175, + "step": 29260 + }, + { + "distill_loss": 0.19832192361354828, + "epoch": 9.759839893262175, + "step": 29260 + }, + { + "epoch": 9.759839893262175, + "ref_ce_loss": 0.06585456430912018, + "step": 29260 + }, + { + "epoch": 9.7631754503002, + "loss": 0.3178, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "grad_norm": 0.836146354675293, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "learning_rate": 1.176101209745717e-06, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "loss": 0.40682804584503174, + "step": 29270 + }, + { + "ce_loss": 0.05214698240160942, + "epoch": 9.7631754503002, + "step": 29270 + }, + { + "distill_loss": 0.13995395600795746, + "epoch": 9.7631754503002, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "ref_ce_loss": 0.05854807794094086, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "loss": 0.3249422013759613, + "step": 29270 + }, + { + "ce_loss": 0.06329483538866043, + "epoch": 9.7631754503002, + "step": 29270 + }, + { + "distill_loss": 0.19355738162994385, + "epoch": 9.7631754503002, + "step": 29270 + }, + { + "epoch": 9.7631754503002, + "ref_ce_loss": 0.06793557852506638, + "step": 29270 + }, + { + "epoch": 9.766511007338226, + "loss": 0.3547, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "grad_norm": 1.3675034046173096, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "learning_rate": 1.1432205843090237e-06, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "loss": 0.35455620288848877, + "step": 29280 + }, + { + "ce_loss": 0.04943772777915001, + "epoch": 9.766511007338226, + "step": 29280 + }, + { + "distill_loss": 0.1921720802783966, + "epoch": 9.766511007338226, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "ref_ce_loss": 0.049962118268013, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "loss": 0.26242080330848694, + "step": 29280 + }, + { + "ce_loss": 0.029609311372041702, + "epoch": 9.766511007338226, + "step": 29280 + }, + { + "distill_loss": 0.16324296593666077, + "epoch": 9.766511007338226, + "step": 29280 + }, + { + "epoch": 9.766511007338226, + "ref_ce_loss": 0.0418899767100811, + "step": 29280 + }, + { + "epoch": 9.769846564376252, + "loss": 0.3091, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "grad_norm": 0.939579963684082, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "learning_rate": 1.1108054676526535e-06, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "loss": 0.31739896535873413, + "step": 29290 + }, + { + "ce_loss": 0.04685492068529129, + "epoch": 9.769846564376252, + "step": 29290 + }, + { + "distill_loss": 0.17787881195545197, + "epoch": 9.769846564376252, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "ref_ce_loss": 0.06475568562746048, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "loss": 0.29376351833343506, + "step": 29290 + }, + { + "ce_loss": 0.02153054066002369, + "epoch": 9.769846564376252, + "step": 29290 + }, + { + "distill_loss": 0.19466844201087952, + "epoch": 9.769846564376252, + "step": 29290 + }, + { + "epoch": 9.769846564376252, + "ref_ce_loss": 0.07677976787090302, + "step": 29290 + }, + { + "epoch": 9.773182121414276, + "loss": 0.3034, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "grad_norm": 0.784531831741333, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "learning_rate": 1.078855897608566e-06, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "loss": 0.37095391750335693, + "step": 29300 + }, + { + "ce_loss": 0.06039052456617355, + "epoch": 9.773182121414276, + "step": 29300 + }, + { + "distill_loss": 0.19763991236686707, + "epoch": 9.773182121414276, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "ref_ce_loss": 0.0946519672870636, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "loss": 0.40771812200546265, + "step": 29300 + }, + { + "ce_loss": 0.027861036360263824, + "epoch": 9.773182121414276, + "step": 29300 + }, + { + "distill_loss": 0.18898458778858185, + "epoch": 9.773182121414276, + "step": 29300 + }, + { + "epoch": 9.773182121414276, + "ref_ce_loss": 0.0597851425409317, + "step": 29300 + }, + { + "epoch": 9.7765176784523, + "loss": 0.3896, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "grad_norm": 0.9433816075325012, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "learning_rate": 1.0473719114653336e-06, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "loss": 0.36913833022117615, + "step": 29310 + }, + { + "ce_loss": 0.03785061836242676, + "epoch": 9.7765176784523, + "step": 29310 + }, + { + "distill_loss": 0.18730568885803223, + "epoch": 9.7765176784523, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "ref_ce_loss": 0.05920045077800751, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "loss": 0.3134559094905853, + "step": 29310 + }, + { + "ce_loss": 0.0518287718296051, + "epoch": 9.7765176784523, + "step": 29310 + }, + { + "distill_loss": 0.16476958990097046, + "epoch": 9.7765176784523, + "step": 29310 + }, + { + "epoch": 9.7765176784523, + "ref_ce_loss": 0.04951930791139603, + "step": 29310 + }, + { + "epoch": 9.779853235490327, + "loss": 0.3495, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "grad_norm": 1.077572226524353, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "learning_rate": 1.016353545968185e-06, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "loss": 0.20148417353630066, + "step": 29320 + }, + { + "ce_loss": 0.01355352159589529, + "epoch": 9.779853235490327, + "step": 29320 + }, + { + "distill_loss": 0.12288177013397217, + "epoch": 9.779853235490327, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "ref_ce_loss": 0.03917378932237625, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "loss": 0.32212889194488525, + "step": 29320 + }, + { + "ce_loss": 0.05061560869216919, + "epoch": 9.779853235490327, + "step": 29320 + }, + { + "distill_loss": 0.17453105747699738, + "epoch": 9.779853235490327, + "step": 29320 + }, + { + "epoch": 9.779853235490327, + "ref_ce_loss": 0.06870347261428833, + "step": 29320 + }, + { + "epoch": 9.783188792528353, + "loss": 0.3211, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "grad_norm": 1.481539011001587, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "learning_rate": 9.858008373188288e-07, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "loss": 0.2580861747264862, + "step": 29330 + }, + { + "ce_loss": 0.034626785665750504, + "epoch": 9.783188792528353, + "step": 29330 + }, + { + "distill_loss": 0.13589392602443695, + "epoch": 9.783188792528353, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "ref_ce_loss": 0.06425262242555618, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "loss": 0.28824493288993835, + "step": 29330 + }, + { + "ce_loss": 0.03179427236318588, + "epoch": 9.783188792528353, + "step": 29330 + }, + { + "distill_loss": 0.16806454956531525, + "epoch": 9.783188792528353, + "step": 29330 + }, + { + "epoch": 9.783188792528353, + "ref_ce_loss": 0.06407367438077927, + "step": 29330 + }, + { + "epoch": 9.786524349566378, + "loss": 0.3656, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "grad_norm": 1.1869555711746216, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "learning_rate": 9.5571382117563e-07, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "loss": 0.49091100692749023, + "step": 29340 + }, + { + "ce_loss": 0.032899271696805954, + "epoch": 9.786524349566378, + "step": 29340 + }, + { + "distill_loss": 0.1702505201101303, + "epoch": 9.786524349566378, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "ref_ce_loss": 0.06909888237714767, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "loss": 0.31385013461112976, + "step": 29340 + }, + { + "ce_loss": 0.04734751954674721, + "epoch": 9.786524349566378, + "step": 29340 + }, + { + "distill_loss": 0.17034968733787537, + "epoch": 9.786524349566378, + "step": 29340 + }, + { + "epoch": 9.786524349566378, + "ref_ce_loss": 0.07363146543502808, + "step": 29340 + }, + { + "epoch": 9.789859906604402, + "loss": 0.3686, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "grad_norm": 0.7241499423980713, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "learning_rate": 9.260925326533443e-07, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "loss": 0.4805721342563629, + "step": 29350 + }, + { + "ce_loss": 0.02742915414273739, + "epoch": 9.789859906604402, + "step": 29350 + }, + { + "distill_loss": 0.2021547257900238, + "epoch": 9.789859906604402, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "ref_ce_loss": 0.05579015612602234, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "loss": 0.39362767338752747, + "step": 29350 + }, + { + "ce_loss": 0.08456600457429886, + "epoch": 9.789859906604402, + "step": 29350 + }, + { + "distill_loss": 0.2172761857509613, + "epoch": 9.789859906604402, + "step": 29350 + }, + { + "epoch": 9.789859906604402, + "ref_ce_loss": 0.0690891370177269, + "step": 29350 + }, + { + "epoch": 9.793195463642428, + "loss": 0.3328, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "grad_norm": 1.407702922821045, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "learning_rate": 8.969370063231619e-07, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "loss": 0.4750968813896179, + "step": 29360 + }, + { + "ce_loss": 0.08430177718400955, + "epoch": 9.793195463642428, + "step": 29360 + }, + { + "distill_loss": 0.17226916551589966, + "epoch": 9.793195463642428, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "ref_ce_loss": 0.08060526847839355, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "loss": 0.3968356251716614, + "step": 29360 + }, + { + "ce_loss": 0.05416855216026306, + "epoch": 9.793195463642428, + "step": 29360 + }, + { + "distill_loss": 0.16487491130828857, + "epoch": 9.793195463642428, + "step": 29360 + }, + { + "epoch": 9.793195463642428, + "ref_ce_loss": 0.04678542539477348, + "step": 29360 + }, + { + "epoch": 9.796531020680455, + "loss": 0.3292, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "grad_norm": 1.0663679838180542, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "learning_rate": 8.682472762127969e-07, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "loss": 0.4573275148868561, + "step": 29370 + }, + { + "ce_loss": 0.03016098216176033, + "epoch": 9.796531020680455, + "step": 29370 + }, + { + "distill_loss": 0.20741283893585205, + "epoch": 9.796531020680455, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "ref_ce_loss": 0.055127277970314026, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "loss": 0.35826027393341064, + "step": 29370 + }, + { + "ce_loss": 0.026030803099274635, + "epoch": 9.796531020680455, + "step": 29370 + }, + { + "distill_loss": 0.17437167465686798, + "epoch": 9.796531020680455, + "step": 29370 + }, + { + "epoch": 9.796531020680455, + "ref_ce_loss": 0.06781844049692154, + "step": 29370 + }, + { + "epoch": 9.799866577718479, + "loss": 0.3586, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "grad_norm": 1.5492557287216187, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "learning_rate": 8.400233758062203e-07, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "loss": 0.3331243097782135, + "step": 29380 + }, + { + "ce_loss": 0.02931608445942402, + "epoch": 9.799866577718479, + "step": 29380 + }, + { + "distill_loss": 0.19186915457248688, + "epoch": 9.799866577718479, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "ref_ce_loss": 0.08439023792743683, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "loss": 0.44308149814605713, + "step": 29380 + }, + { + "ce_loss": 0.05596522241830826, + "epoch": 9.799866577718479, + "step": 29380 + }, + { + "distill_loss": 0.20761224627494812, + "epoch": 9.799866577718479, + "step": 29380 + }, + { + "epoch": 9.799866577718479, + "ref_ce_loss": 0.07270487397909164, + "step": 29380 + }, + { + "epoch": 9.803202134756503, + "loss": 0.356, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "grad_norm": 1.0058664083480835, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "learning_rate": 8.122653380437494e-07, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "loss": 0.4317482113838196, + "step": 29390 + }, + { + "ce_loss": 0.09004373848438263, + "epoch": 9.803202134756503, + "step": 29390 + }, + { + "distill_loss": 0.22809693217277527, + "epoch": 9.803202134756503, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "ref_ce_loss": 0.07940634340047836, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "loss": 0.3935472071170807, + "step": 29390 + }, + { + "ce_loss": 0.031198523938655853, + "epoch": 9.803202134756503, + "step": 29390 + }, + { + "distill_loss": 0.1570068597793579, + "epoch": 9.803202134756503, + "step": 29390 + }, + { + "epoch": 9.803202134756503, + "ref_ce_loss": 0.06303267925977707, + "step": 29390 + }, + { + "epoch": 9.80653769179453, + "loss": 0.3451, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "grad_norm": 1.0701510906219482, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "learning_rate": 7.849731953219586e-07, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "loss": 0.4929094612598419, + "step": 29400 + }, + { + "ce_loss": 0.06670527160167694, + "epoch": 9.80653769179453, + "step": 29400 + }, + { + "distill_loss": 0.21560506522655487, + "epoch": 9.80653769179453, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "ref_ce_loss": 0.0714222639799118, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "loss": 0.5444076657295227, + "step": 29400 + }, + { + "ce_loss": 0.05342886596918106, + "epoch": 9.80653769179453, + "step": 29400 + }, + { + "distill_loss": 0.2137109786272049, + "epoch": 9.80653769179453, + "step": 29400 + }, + { + "epoch": 9.80653769179453, + "ref_ce_loss": 0.07326829433441162, + "step": 29400 + }, + { + "epoch": 9.809873248832556, + "loss": 0.3673, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "grad_norm": 1.2126073837280273, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "learning_rate": 7.581469794938123e-07, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "loss": 0.24275337159633636, + "step": 29410 + }, + { + "ce_loss": 0.02250901237130165, + "epoch": 9.809873248832556, + "step": 29410 + }, + { + "distill_loss": 0.11409185826778412, + "epoch": 9.809873248832556, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "ref_ce_loss": 0.054690055549144745, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "loss": 0.28774747252464294, + "step": 29410 + }, + { + "ce_loss": 0.033412329852581024, + "epoch": 9.809873248832556, + "step": 29410 + }, + { + "distill_loss": 0.205092191696167, + "epoch": 9.809873248832556, + "step": 29410 + }, + { + "epoch": 9.809873248832556, + "ref_ce_loss": 0.04910941794514656, + "step": 29410 + }, + { + "epoch": 9.81320880587058, + "loss": 0.2915, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "grad_norm": 1.0646889209747314, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "learning_rate": 7.317867218683549e-07, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "loss": 0.3459312319755554, + "step": 29420 + }, + { + "ce_loss": 0.05426276847720146, + "epoch": 9.81320880587058, + "step": 29420 + }, + { + "distill_loss": 0.17972438037395477, + "epoch": 9.81320880587058, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "ref_ce_loss": 0.04935836419463158, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "loss": 0.4190525412559509, + "step": 29420 + }, + { + "ce_loss": 0.023804601281881332, + "epoch": 9.81320880587058, + "step": 29420 + }, + { + "distill_loss": 0.1546304076910019, + "epoch": 9.81320880587058, + "step": 29420 + }, + { + "epoch": 9.81320880587058, + "ref_ce_loss": 0.07818574458360672, + "step": 29420 + }, + { + "epoch": 9.816544362908605, + "loss": 0.3582, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "grad_norm": 1.1533007621765137, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "learning_rate": 7.058924532107991e-07, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "loss": 0.33537954092025757, + "step": 29430 + }, + { + "ce_loss": 0.04029664024710655, + "epoch": 9.816544362908605, + "step": 29430 + }, + { + "distill_loss": 0.1852310448884964, + "epoch": 9.816544362908605, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "ref_ce_loss": 0.07313943654298782, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "loss": 0.22467778623104095, + "step": 29430 + }, + { + "ce_loss": 0.02593729831278324, + "epoch": 9.816544362908605, + "step": 29430 + }, + { + "distill_loss": 0.12661072611808777, + "epoch": 9.816544362908605, + "step": 29430 + }, + { + "epoch": 9.816544362908605, + "ref_ce_loss": 0.049755774438381195, + "step": 29430 + }, + { + "epoch": 9.819879919946631, + "loss": 0.3504, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "grad_norm": 1.1615389585494995, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "learning_rate": 6.804642037425701e-07, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "loss": 0.4558985233306885, + "step": 29440 + }, + { + "ce_loss": 0.023333707824349403, + "epoch": 9.819879919946631, + "step": 29440 + }, + { + "distill_loss": 0.177892804145813, + "epoch": 9.819879919946631, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "ref_ce_loss": 0.06867125630378723, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "loss": 0.33130738139152527, + "step": 29440 + }, + { + "ce_loss": 0.023025374859571457, + "epoch": 9.819879919946631, + "step": 29440 + }, + { + "distill_loss": 0.16406647861003876, + "epoch": 9.819879919946631, + "step": 29440 + }, + { + "epoch": 9.819879919946631, + "ref_ce_loss": 0.06003274768590927, + "step": 29440 + }, + { + "epoch": 9.823215476984657, + "loss": 0.3357, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "grad_norm": 1.0378285646438599, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "learning_rate": 6.555020031412173e-07, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "loss": 0.26800641417503357, + "step": 29450 + }, + { + "ce_loss": 0.015543187037110329, + "epoch": 9.823215476984657, + "step": 29450 + }, + { + "distill_loss": 0.15196271240711212, + "epoch": 9.823215476984657, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "ref_ce_loss": 0.04392324015498161, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "loss": 0.37747907638549805, + "step": 29450 + }, + { + "ce_loss": 0.057008299976587296, + "epoch": 9.823215476984657, + "step": 29450 + }, + { + "distill_loss": 0.20629453659057617, + "epoch": 9.823215476984657, + "step": 29450 + }, + { + "epoch": 9.823215476984657, + "ref_ce_loss": 0.05720047280192375, + "step": 29450 + }, + { + "epoch": 9.826551034022682, + "loss": 0.3457, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "grad_norm": 1.5450215339660645, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "learning_rate": 6.310058805402364e-07, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "loss": 0.1829114556312561, + "step": 29460 + }, + { + "ce_loss": 0.016622690483927727, + "epoch": 9.826551034022682, + "step": 29460 + }, + { + "distill_loss": 0.12692323327064514, + "epoch": 9.826551034022682, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "ref_ce_loss": 0.039235811680555344, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "loss": 0.23720866441726685, + "step": 29460 + }, + { + "ce_loss": 0.0072109573520720005, + "epoch": 9.826551034022682, + "step": 29460 + }, + { + "distill_loss": 0.1425161361694336, + "epoch": 9.826551034022682, + "step": 29460 + }, + { + "epoch": 9.826551034022682, + "ref_ce_loss": 0.03943220153450966, + "step": 29460 + }, + { + "epoch": 9.829886591060706, + "loss": 0.3084, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "grad_norm": 1.1493730545043945, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "learning_rate": 6.069758645292911e-07, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "loss": 0.2494208961725235, + "step": 29470 + }, + { + "ce_loss": 0.032901324331760406, + "epoch": 9.829886591060706, + "step": 29470 + }, + { + "distill_loss": 0.15291579067707062, + "epoch": 9.829886591060706, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "ref_ce_loss": 0.04548323526978493, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "loss": 0.3046186566352844, + "step": 29470 + }, + { + "ce_loss": 0.03033779375255108, + "epoch": 9.829886591060706, + "step": 29470 + }, + { + "distill_loss": 0.1353941112756729, + "epoch": 9.829886591060706, + "step": 29470 + }, + { + "epoch": 9.829886591060706, + "ref_ce_loss": 0.05030620098114014, + "step": 29470 + }, + { + "epoch": 9.833222148098733, + "loss": 0.3507, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "grad_norm": 1.4322553873062134, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "learning_rate": 5.834119831539476e-07, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "loss": 0.2620355486869812, + "step": 29480 + }, + { + "ce_loss": 0.03061460703611374, + "epoch": 9.833222148098733, + "step": 29480 + }, + { + "distill_loss": 0.15820306539535522, + "epoch": 9.833222148098733, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "ref_ce_loss": 0.07300352305173874, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "loss": 0.37205538153648376, + "step": 29480 + }, + { + "ce_loss": 0.06081470847129822, + "epoch": 9.833222148098733, + "step": 29480 + }, + { + "distill_loss": 0.19428209960460663, + "epoch": 9.833222148098733, + "step": 29480 + }, + { + "epoch": 9.833222148098733, + "ref_ce_loss": 0.0742083266377449, + "step": 29480 + }, + { + "epoch": 9.836557705136759, + "loss": 0.33, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "grad_norm": 0.9687492251396179, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "learning_rate": 5.603142639158509e-07, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "loss": 0.3235549032688141, + "step": 29490 + }, + { + "ce_loss": 0.07004064321517944, + "epoch": 9.836557705136759, + "step": 29490 + }, + { + "distill_loss": 0.17677612602710724, + "epoch": 9.836557705136759, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "ref_ce_loss": 0.05695893242955208, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "loss": 0.3133566081523895, + "step": 29490 + }, + { + "ce_loss": 0.026781374588608742, + "epoch": 9.836557705136759, + "step": 29490 + }, + { + "distill_loss": 0.15334056317806244, + "epoch": 9.836557705136759, + "step": 29490 + }, + { + "epoch": 9.836557705136759, + "ref_ce_loss": 0.06433262676000595, + "step": 29490 + }, + { + "epoch": 9.839893262174783, + "loss": 0.345, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "grad_norm": 2.4552054405212402, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "learning_rate": 5.376827337725043e-07, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "loss": 0.2624857723712921, + "step": 29500 + }, + { + "ce_loss": 0.03923223912715912, + "epoch": 9.839893262174783, + "step": 29500 + }, + { + "distill_loss": 0.13083603978157043, + "epoch": 9.839893262174783, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "ref_ce_loss": 0.07049775868654251, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "loss": 0.2594766914844513, + "step": 29500 + }, + { + "ce_loss": 0.02417309582233429, + "epoch": 9.839893262174783, + "step": 29500 + }, + { + "distill_loss": 0.1470203399658203, + "epoch": 9.839893262174783, + "step": 29500 + }, + { + "epoch": 9.839893262174783, + "ref_ce_loss": 0.053291480988264084, + "step": 29500 + }, + { + "epoch": 9.843228819212808, + "loss": 0.3229, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "grad_norm": 0.9992929100990295, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "learning_rate": 5.155174191373125e-07, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "loss": 0.37495696544647217, + "step": 29510 + }, + { + "ce_loss": 0.03893393650650978, + "epoch": 9.843228819212808, + "step": 29510 + }, + { + "distill_loss": 0.20713326334953308, + "epoch": 9.843228819212808, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "ref_ce_loss": 0.0703553631901741, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "loss": 0.2708454728126526, + "step": 29510 + }, + { + "ce_loss": 0.018474353477358818, + "epoch": 9.843228819212808, + "step": 29510 + }, + { + "distill_loss": 0.14013908803462982, + "epoch": 9.843228819212808, + "step": 29510 + }, + { + "epoch": 9.843228819212808, + "ref_ce_loss": 0.049369096755981445, + "step": 29510 + }, + { + "epoch": 9.846564376250834, + "loss": 0.3381, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "grad_norm": 1.5160677433013916, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "learning_rate": 4.938183458796263e-07, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "loss": 0.3293970227241516, + "step": 29520 + }, + { + "ce_loss": 0.06448846310377121, + "epoch": 9.846564376250834, + "step": 29520 + }, + { + "distill_loss": 0.17817185819149017, + "epoch": 9.846564376250834, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "ref_ce_loss": 0.0654769167304039, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "loss": 0.2610923945903778, + "step": 29520 + }, + { + "ce_loss": 0.028385812416672707, + "epoch": 9.846564376250834, + "step": 29520 + }, + { + "distill_loss": 0.12585677206516266, + "epoch": 9.846564376250834, + "step": 29520 + }, + { + "epoch": 9.846564376250834, + "ref_ce_loss": 0.05476205796003342, + "step": 29520 + }, + { + "epoch": 9.84989993328886, + "loss": 0.3552, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "grad_norm": 1.071336030960083, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "learning_rate": 4.7258553932456597e-07, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "loss": 0.3657957911491394, + "step": 29530 + }, + { + "ce_loss": 0.05615170672535896, + "epoch": 9.84989993328886, + "step": 29530 + }, + { + "distill_loss": 0.2162410318851471, + "epoch": 9.84989993328886, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "ref_ce_loss": 0.07879635691642761, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "loss": 0.34869688749313354, + "step": 29530 + }, + { + "ce_loss": 0.017612233757972717, + "epoch": 9.84989993328886, + "step": 29530 + }, + { + "distill_loss": 0.1881450116634369, + "epoch": 9.84989993328886, + "step": 29530 + }, + { + "epoch": 9.84989993328886, + "ref_ce_loss": 0.06109461188316345, + "step": 29530 + }, + { + "epoch": 9.853235490326885, + "loss": 0.3863, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "grad_norm": 1.999192237854004, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "learning_rate": 4.518190242531084e-07, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "loss": 0.30932730436325073, + "step": 29540 + }, + { + "ce_loss": 0.03714052587747574, + "epoch": 9.853235490326885, + "step": 29540 + }, + { + "distill_loss": 0.15744534134864807, + "epoch": 9.853235490326885, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "ref_ce_loss": 0.047130778431892395, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "loss": 0.4077582359313965, + "step": 29540 + }, + { + "ce_loss": 0.05509041249752045, + "epoch": 9.853235490326885, + "step": 29540 + }, + { + "distill_loss": 0.18503007292747498, + "epoch": 9.853235490326885, + "step": 29540 + }, + { + "epoch": 9.853235490326885, + "ref_ce_loss": 0.0741603821516037, + "step": 29540 + }, + { + "epoch": 9.856571047364909, + "loss": 0.3228, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "grad_norm": 1.1742290258407593, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "learning_rate": 4.315188249019997e-07, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "loss": 0.2508600950241089, + "step": 29550 + }, + { + "ce_loss": 0.031531572341918945, + "epoch": 9.856571047364909, + "step": 29550 + }, + { + "distill_loss": 0.1460273116827011, + "epoch": 9.856571047364909, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "ref_ce_loss": 0.05061808601021767, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "loss": 0.34020158648490906, + "step": 29550 + }, + { + "ce_loss": 0.042917292565107346, + "epoch": 9.856571047364909, + "step": 29550 + }, + { + "distill_loss": 0.18208904564380646, + "epoch": 9.856571047364909, + "step": 29550 + }, + { + "epoch": 9.856571047364909, + "ref_ce_loss": 0.07280907779932022, + "step": 29550 + }, + { + "epoch": 9.859906604402935, + "loss": 0.3637, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "grad_norm": 1.4106426239013672, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "learning_rate": 4.116849649637544e-07, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "loss": 0.27583619952201843, + "step": 29560 + }, + { + "ce_loss": 0.019909940659999847, + "epoch": 9.859906604402935, + "step": 29560 + }, + { + "distill_loss": 0.1528017371892929, + "epoch": 9.859906604402935, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "ref_ce_loss": 0.04741690680384636, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "loss": 0.39467301964759827, + "step": 29560 + }, + { + "ce_loss": 0.05224955081939697, + "epoch": 9.859906604402935, + "step": 29560 + }, + { + "distill_loss": 0.21950089931488037, + "epoch": 9.859906604402935, + "step": 29560 + }, + { + "epoch": 9.859906604402935, + "ref_ce_loss": 0.0884372740983963, + "step": 29560 + }, + { + "epoch": 9.863242161440962, + "loss": 0.3691, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "grad_norm": 1.08582603931427, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "learning_rate": 3.923174675866559e-07, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "loss": 0.297434002161026, + "step": 29570 + }, + { + "ce_loss": 0.018246673047542572, + "epoch": 9.863242161440962, + "step": 29570 + }, + { + "distill_loss": 0.16588199138641357, + "epoch": 9.863242161440962, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "ref_ce_loss": 0.05477264150977135, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "loss": 0.19365330040454865, + "step": 29570 + }, + { + "ce_loss": 0.0199178084731102, + "epoch": 9.863242161440962, + "step": 29570 + }, + { + "distill_loss": 0.1329023540019989, + "epoch": 9.863242161440962, + "step": 29570 + }, + { + "epoch": 9.863242161440962, + "ref_ce_loss": 0.040626198053359985, + "step": 29570 + }, + { + "epoch": 9.866577718478986, + "loss": 0.3131, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "grad_norm": 1.564894199371338, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "learning_rate": 3.734163553746672e-07, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "loss": 0.3847261667251587, + "step": 29580 + }, + { + "ce_loss": 0.028989629819989204, + "epoch": 9.866577718478986, + "step": 29580 + }, + { + "distill_loss": 0.1808268129825592, + "epoch": 9.866577718478986, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "ref_ce_loss": 0.06549061834812164, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "loss": 0.3929581046104431, + "step": 29580 + }, + { + "ce_loss": 0.037682678550481796, + "epoch": 9.866577718478986, + "step": 29580 + }, + { + "distill_loss": 0.21550612151622772, + "epoch": 9.866577718478986, + "step": 29580 + }, + { + "epoch": 9.866577718478986, + "ref_ce_loss": 0.07935473322868347, + "step": 29580 + }, + { + "epoch": 9.86991327551701, + "loss": 0.3386, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "grad_norm": 1.890181064605713, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "learning_rate": 3.5498165038734263e-07, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "loss": 0.31606292724609375, + "step": 29590 + }, + { + "ce_loss": 0.06806986778974533, + "epoch": 9.86991327551701, + "step": 29590 + }, + { + "distill_loss": 0.18640464544296265, + "epoch": 9.86991327551701, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "ref_ce_loss": 0.061429791152477264, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "loss": 0.31619423627853394, + "step": 29590 + }, + { + "ce_loss": 0.0290507934987545, + "epoch": 9.86991327551701, + "step": 29590 + }, + { + "distill_loss": 0.1894863247871399, + "epoch": 9.86991327551701, + "step": 29590 + }, + { + "epoch": 9.86991327551701, + "ref_ce_loss": 0.08042974025011063, + "step": 29590 + }, + { + "epoch": 9.873248832555037, + "loss": 0.3569, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "grad_norm": 1.0221617221832275, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "learning_rate": 3.370133741400494e-07, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "loss": 0.25058314204216003, + "step": 29600 + }, + { + "ce_loss": 0.011658146977424622, + "epoch": 9.873248832555037, + "step": 29600 + }, + { + "distill_loss": 0.15176154673099518, + "epoch": 9.873248832555037, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "ref_ce_loss": 0.062388718128204346, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "loss": 0.3523558974266052, + "step": 29600 + }, + { + "ce_loss": 0.056740108877420425, + "epoch": 9.873248832555037, + "step": 29600 + }, + { + "distill_loss": 0.1592649668455124, + "epoch": 9.873248832555037, + "step": 29600 + }, + { + "epoch": 9.873248832555037, + "ref_ce_loss": 0.0709981843829155, + "step": 29600 + }, + { + "epoch": 9.876584389593063, + "loss": 0.3249, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "grad_norm": 0.8258252143859863, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "learning_rate": 3.1951154760365696e-07, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "loss": 0.35080069303512573, + "step": 29610 + }, + { + "ce_loss": 0.03846856951713562, + "epoch": 9.876584389593063, + "step": 29610 + }, + { + "distill_loss": 0.17402790486812592, + "epoch": 9.876584389593063, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "ref_ce_loss": 0.06167008355259895, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "loss": 0.31475287675857544, + "step": 29610 + }, + { + "ce_loss": 0.03450953960418701, + "epoch": 9.876584389593063, + "step": 29610 + }, + { + "distill_loss": 0.19979970157146454, + "epoch": 9.876584389593063, + "step": 29610 + }, + { + "epoch": 9.876584389593063, + "ref_ce_loss": 0.05786199867725372, + "step": 29610 + }, + { + "epoch": 9.879919946631087, + "loss": 0.3241, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "grad_norm": 0.9958456754684448, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "learning_rate": 3.024761912046703e-07, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "loss": 0.3097935914993286, + "step": 29620 + }, + { + "ce_loss": 0.035024844110012054, + "epoch": 9.879919946631087, + "step": 29620 + }, + { + "distill_loss": 0.19233094155788422, + "epoch": 9.879919946631087, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "ref_ce_loss": 0.0557713583111763, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "loss": 0.2509743869304657, + "step": 29620 + }, + { + "ce_loss": 0.041433319449424744, + "epoch": 9.879919946631087, + "step": 29620 + }, + { + "distill_loss": 0.14673636853694916, + "epoch": 9.879919946631087, + "step": 29620 + }, + { + "epoch": 9.879919946631087, + "ref_ce_loss": 0.05089586228132248, + "step": 29620 + }, + { + "epoch": 9.883255503669112, + "loss": 0.3602, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "grad_norm": 1.3219614028930664, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "learning_rate": 2.8590732482522977e-07, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "loss": 0.29133516550064087, + "step": 29630 + }, + { + "ce_loss": 0.054918576031923294, + "epoch": 9.883255503669112, + "step": 29630 + }, + { + "distill_loss": 0.16251620650291443, + "epoch": 9.883255503669112, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "ref_ce_loss": 0.05042388662695885, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "loss": 0.27201393246650696, + "step": 29630 + }, + { + "ce_loss": 0.04357181861996651, + "epoch": 9.883255503669112, + "step": 29630 + }, + { + "distill_loss": 0.14555370807647705, + "epoch": 9.883255503669112, + "step": 29630 + }, + { + "epoch": 9.883255503669112, + "ref_ce_loss": 0.05804718658328056, + "step": 29630 + }, + { + "epoch": 9.886591060707138, + "loss": 0.3212, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "grad_norm": 0.9410602450370789, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "learning_rate": 2.698049678029335e-07, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "loss": 0.3896907567977905, + "step": 29640 + }, + { + "ce_loss": 0.027953682467341423, + "epoch": 9.886591060707138, + "step": 29640 + }, + { + "distill_loss": 0.23215749859809875, + "epoch": 9.886591060707138, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "ref_ce_loss": 0.0854712501168251, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "loss": 0.2254749983549118, + "step": 29640 + }, + { + "ce_loss": 0.03735177218914032, + "epoch": 9.886591060707138, + "step": 29640 + }, + { + "distill_loss": 0.12590903043746948, + "epoch": 9.886591060707138, + "step": 29640 + }, + { + "epoch": 9.886591060707138, + "ref_ce_loss": 0.06130973994731903, + "step": 29640 + }, + { + "epoch": 9.889926617745164, + "loss": 0.3663, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "grad_norm": 1.0769139528274536, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "learning_rate": 2.5416913893101526e-07, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "loss": 0.31788861751556396, + "step": 29650 + }, + { + "ce_loss": 0.026286710053682327, + "epoch": 9.889926617745164, + "step": 29650 + }, + { + "distill_loss": 0.17324940860271454, + "epoch": 9.889926617745164, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "ref_ce_loss": 0.058500248938798904, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "loss": 0.32835084199905396, + "step": 29650 + }, + { + "ce_loss": 0.02977113611996174, + "epoch": 9.889926617745164, + "step": 29650 + }, + { + "distill_loss": 0.15367326140403748, + "epoch": 9.889926617745164, + "step": 29650 + }, + { + "epoch": 9.889926617745164, + "ref_ce_loss": 0.04256702959537506, + "step": 29650 + }, + { + "epoch": 9.893262174783189, + "loss": 0.3122, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "grad_norm": 1.0725162029266357, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "learning_rate": 2.389998564581664e-07, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "loss": 0.29371413588523865, + "step": 29660 + }, + { + "ce_loss": 0.0522097647190094, + "epoch": 9.893262174783189, + "step": 29660 + }, + { + "distill_loss": 0.1740894466638565, + "epoch": 9.893262174783189, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "ref_ce_loss": 0.039615560322999954, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "loss": 0.36313530802726746, + "step": 29660 + }, + { + "ce_loss": 0.05687631294131279, + "epoch": 9.893262174783189, + "step": 29660 + }, + { + "distill_loss": 0.23256993293762207, + "epoch": 9.893262174783189, + "step": 29660 + }, + { + "epoch": 9.893262174783189, + "ref_ce_loss": 0.0732324868440628, + "step": 29660 + }, + { + "epoch": 9.896597731821213, + "loss": 0.3409, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "grad_norm": 0.787322998046875, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "learning_rate": 2.2429713808849174e-07, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "loss": 0.29244524240493774, + "step": 29670 + }, + { + "ce_loss": 0.04288395494222641, + "epoch": 9.896597731821213, + "step": 29670 + }, + { + "distill_loss": 0.18360666930675507, + "epoch": 9.896597731821213, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "ref_ce_loss": 0.06513893604278564, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "loss": 0.3685804009437561, + "step": 29670 + }, + { + "ce_loss": 0.04656131938099861, + "epoch": 9.896597731821213, + "step": 29670 + }, + { + "distill_loss": 0.165052130818367, + "epoch": 9.896597731821213, + "step": 29670 + }, + { + "epoch": 9.896597731821213, + "ref_ce_loss": 0.0664345920085907, + "step": 29670 + }, + { + "epoch": 9.89993328885924, + "loss": 0.3623, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "grad_norm": 0.9299017786979675, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "learning_rate": 2.1006100098173164e-07, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "loss": 0.3087444305419922, + "step": 29680 + }, + { + "ce_loss": 0.0318371057510376, + "epoch": 9.89993328885924, + "step": 29680 + }, + { + "distill_loss": 0.1628321409225464, + "epoch": 9.89993328885924, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "ref_ce_loss": 0.07229151576757431, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "loss": 0.7314703464508057, + "step": 29680 + }, + { + "ce_loss": 0.02534298598766327, + "epoch": 9.89993328885924, + "step": 29680 + }, + { + "distill_loss": 0.21285401284694672, + "epoch": 9.89993328885924, + "step": 29680 + }, + { + "epoch": 9.89993328885924, + "ref_ce_loss": 0.06899610161781311, + "step": 29680 + }, + { + "epoch": 9.903268845897266, + "loss": 0.3585, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "grad_norm": 1.0982376337051392, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "learning_rate": 1.9629146175295098e-07, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "loss": 0.3895324170589447, + "step": 29690 + }, + { + "ce_loss": 0.054450660943984985, + "epoch": 9.903268845897266, + "step": 29690 + }, + { + "distill_loss": 0.18156380951404572, + "epoch": 9.903268845897266, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "ref_ce_loss": 0.08041262626647949, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "loss": 0.26195472478866577, + "step": 29690 + }, + { + "ce_loss": 0.04622939974069595, + "epoch": 9.903268845897266, + "step": 29690 + }, + { + "distill_loss": 0.13963152468204498, + "epoch": 9.903268845897266, + "step": 29690 + }, + { + "epoch": 9.903268845897266, + "ref_ce_loss": 0.05275070294737816, + "step": 29690 + }, + { + "epoch": 9.90660440293529, + "loss": 0.3262, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "grad_norm": 2.3947901725769043, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "learning_rate": 1.8298853647267245e-07, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "loss": 0.24890460073947906, + "step": 29700 + }, + { + "ce_loss": 0.023720307275652885, + "epoch": 9.90660440293529, + "step": 29700 + }, + { + "distill_loss": 0.15437108278274536, + "epoch": 9.90660440293529, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "ref_ce_loss": 0.0504058375954628, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "loss": 0.28565430641174316, + "step": 29700 + }, + { + "ce_loss": 0.034484151750802994, + "epoch": 9.90660440293529, + "step": 29700 + }, + { + "distill_loss": 0.15133896470069885, + "epoch": 9.90660440293529, + "step": 29700 + }, + { + "epoch": 9.90660440293529, + "ref_ce_loss": 0.06469812989234924, + "step": 29700 + }, + { + "epoch": 9.909939959973315, + "loss": 0.312, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "grad_norm": 1.3704745769500732, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "learning_rate": 1.7015224066692092e-07, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "loss": 0.4401421546936035, + "step": 29710 + }, + { + "ce_loss": 0.07302363961935043, + "epoch": 9.909939959973315, + "step": 29710 + }, + { + "distill_loss": 0.22405001521110535, + "epoch": 9.909939959973315, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "ref_ce_loss": 0.06941098719835281, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "loss": 0.2773750424385071, + "step": 29710 + }, + { + "ce_loss": 0.037247832864522934, + "epoch": 9.909939959973315, + "step": 29710 + }, + { + "distill_loss": 0.15443992614746094, + "epoch": 9.909939959973315, + "step": 29710 + }, + { + "epoch": 9.909939959973315, + "ref_ce_loss": 0.062117595225572586, + "step": 29710 + }, + { + "epoch": 9.913275517011341, + "loss": 0.3324, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "grad_norm": 1.1411337852478027, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "learning_rate": 1.577825893169127e-07, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "loss": 0.32584357261657715, + "step": 29720 + }, + { + "ce_loss": 0.04434728994965553, + "epoch": 9.913275517011341, + "step": 29720 + }, + { + "distill_loss": 0.18526092171669006, + "epoch": 9.913275517011341, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "ref_ce_loss": 0.06991927325725555, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "loss": 0.2466956079006195, + "step": 29720 + }, + { + "ce_loss": 0.04231082275509834, + "epoch": 9.913275517011341, + "step": 29720 + }, + { + "distill_loss": 0.16381202638149261, + "epoch": 9.913275517011341, + "step": 29720 + }, + { + "epoch": 9.913275517011341, + "ref_ce_loss": 0.04034992679953575, + "step": 29720 + }, + { + "epoch": 9.916611074049367, + "loss": 0.3259, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "grad_norm": 0.9959713816642761, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "learning_rate": 1.4587959685945508e-07, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "loss": 0.2603667378425598, + "step": 29730 + }, + { + "ce_loss": 0.026754045858979225, + "epoch": 9.916611074049367, + "step": 29730 + }, + { + "distill_loss": 0.17127679288387299, + "epoch": 9.916611074049367, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "ref_ce_loss": 0.038138411939144135, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "loss": 0.29234760999679565, + "step": 29730 + }, + { + "ce_loss": 0.03642908111214638, + "epoch": 9.916611074049367, + "step": 29730 + }, + { + "distill_loss": 0.16722798347473145, + "epoch": 9.916611074049367, + "step": 29730 + }, + { + "epoch": 9.916611074049367, + "ref_ce_loss": 0.052527159452438354, + "step": 29730 + }, + { + "epoch": 9.919946631087392, + "loss": 0.3273, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "grad_norm": 0.7950869798660278, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "learning_rate": 1.3444327718659112e-07, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "loss": 0.3004519045352936, + "step": 29740 + }, + { + "ce_loss": 0.03613626956939697, + "epoch": 9.919946631087392, + "step": 29740 + }, + { + "distill_loss": 0.18733999133110046, + "epoch": 9.919946631087392, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "ref_ce_loss": 0.05546070635318756, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "loss": 0.23486210405826569, + "step": 29740 + }, + { + "ce_loss": 0.030922196805477142, + "epoch": 9.919946631087392, + "step": 29740 + }, + { + "distill_loss": 0.11665789783000946, + "epoch": 9.919946631087392, + "step": 29740 + }, + { + "epoch": 9.919946631087392, + "ref_ce_loss": 0.06070510670542717, + "step": 29740 + }, + { + "epoch": 9.923282188125416, + "loss": 0.3393, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "grad_norm": 0.9183117747306824, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "learning_rate": 1.2347364364573288e-07, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "loss": 0.27825382351875305, + "step": 29750 + }, + { + "ce_loss": 0.05024456977844238, + "epoch": 9.923282188125416, + "step": 29750 + }, + { + "distill_loss": 0.18313361704349518, + "epoch": 9.923282188125416, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "ref_ce_loss": 0.04465365782380104, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "loss": 0.28916436433792114, + "step": 29750 + }, + { + "ce_loss": 0.04393957555294037, + "epoch": 9.923282188125416, + "step": 29750 + }, + { + "distill_loss": 0.17849382758140564, + "epoch": 9.923282188125416, + "step": 29750 + }, + { + "epoch": 9.923282188125416, + "ref_ce_loss": 0.06638529151678085, + "step": 29750 + }, + { + "epoch": 9.926617745163442, + "loss": 0.3188, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "grad_norm": 1.2711544036865234, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "learning_rate": 1.1297070903966145e-07, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "loss": 0.37991073727607727, + "step": 29760 + }, + { + "ce_loss": 0.051403578370809555, + "epoch": 9.926617745163442, + "step": 29760 + }, + { + "distill_loss": 0.20777808129787445, + "epoch": 9.926617745163442, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "ref_ce_loss": 0.07273232191801071, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "loss": 0.3107052147388458, + "step": 29760 + }, + { + "ce_loss": 0.018291166052222252, + "epoch": 9.926617745163442, + "step": 29760 + }, + { + "distill_loss": 0.18891987204551697, + "epoch": 9.926617745163442, + "step": 29760 + }, + { + "epoch": 9.926617745163442, + "ref_ce_loss": 0.07224489748477936, + "step": 29760 + }, + { + "epoch": 9.929953302201469, + "loss": 0.3146, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "grad_norm": 1.1790691614151, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "learning_rate": 1.0293448562634922e-07, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "loss": 0.29821261763572693, + "step": 29770 + }, + { + "ce_loss": 0.03646909445524216, + "epoch": 9.929953302201469, + "step": 29770 + }, + { + "distill_loss": 0.18316808342933655, + "epoch": 9.929953302201469, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "ref_ce_loss": 0.05794721841812134, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "loss": 0.2972831130027771, + "step": 29770 + }, + { + "ce_loss": 0.05520421639084816, + "epoch": 9.929953302201469, + "step": 29770 + }, + { + "distill_loss": 0.15047815442085266, + "epoch": 9.929953302201469, + "step": 29770 + }, + { + "epoch": 9.929953302201469, + "ref_ce_loss": 0.07132155448198318, + "step": 29770 + }, + { + "epoch": 9.933288859239493, + "loss": 0.3568, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "grad_norm": 1.081078052520752, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "learning_rate": 9.336498511922643e-08, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "loss": 0.3281066119670868, + "step": 29780 + }, + { + "ce_loss": 0.0696646198630333, + "epoch": 9.933288859239493, + "step": 29780 + }, + { + "distill_loss": 0.16033565998077393, + "epoch": 9.933288859239493, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "ref_ce_loss": 0.0567629374563694, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "loss": 0.20319396257400513, + "step": 29780 + }, + { + "ce_loss": 0.025403108447790146, + "epoch": 9.933288859239493, + "step": 29780 + }, + { + "distill_loss": 0.11004454642534256, + "epoch": 9.933288859239493, + "step": 29780 + }, + { + "epoch": 9.933288859239493, + "ref_ce_loss": 0.052880749106407166, + "step": 29780 + }, + { + "epoch": 9.936624416277517, + "loss": 0.3406, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "grad_norm": 1.4391337633132935, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "learning_rate": 8.426221868687023e-08, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "loss": 0.34950166940689087, + "step": 29790 + }, + { + "ce_loss": 0.06122155115008354, + "epoch": 9.936624416277517, + "step": 29790 + }, + { + "distill_loss": 0.17466667294502258, + "epoch": 9.936624416277517, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "ref_ce_loss": 0.08305674046278, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "loss": 0.28921905159950256, + "step": 29790 + }, + { + "ce_loss": 0.030770935118198395, + "epoch": 9.936624416277517, + "step": 29790 + }, + { + "distill_loss": 0.1881738156080246, + "epoch": 9.936624416277517, + "step": 29790 + }, + { + "epoch": 9.936624416277517, + "ref_ce_loss": 0.04746009409427643, + "step": 29790 + }, + { + "epoch": 9.939959973315544, + "loss": 0.3373, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "grad_norm": 2.557058334350586, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "learning_rate": 7.562619695327122e-08, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "loss": 0.42486026883125305, + "step": 29800 + }, + { + "ce_loss": 0.056674372404813766, + "epoch": 9.939959973315544, + "step": 29800 + }, + { + "distill_loss": 0.19006691873073578, + "epoch": 9.939959973315544, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "ref_ce_loss": 0.06043754145503044, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "loss": 0.3627516031265259, + "step": 29800 + }, + { + "ce_loss": 0.041415825486183167, + "epoch": 9.939959973315544, + "step": 29800 + }, + { + "distill_loss": 0.19671253859996796, + "epoch": 9.939959973315544, + "step": 29800 + }, + { + "epoch": 9.939959973315544, + "ref_ce_loss": 0.08194684237241745, + "step": 29800 + }, + { + "epoch": 9.94329553035357, + "loss": 0.348, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "grad_norm": 1.006632924079895, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "learning_rate": 6.74569299974781e-08, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "loss": 0.22724904119968414, + "step": 29810 + }, + { + "ce_loss": 0.025708623230457306, + "epoch": 9.94329553035357, + "step": 29810 + }, + { + "distill_loss": 0.13873682916164398, + "epoch": 9.94329553035357, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "ref_ce_loss": 0.04845819249749184, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "loss": 0.23277434706687927, + "step": 29810 + }, + { + "ce_loss": 0.016086967661976814, + "epoch": 9.94329553035357, + "step": 29810 + }, + { + "distill_loss": 0.16063126921653748, + "epoch": 9.94329553035357, + "step": 29810 + }, + { + "epoch": 9.94329553035357, + "ref_ce_loss": 0.03908000513911247, + "step": 29810 + }, + { + "epoch": 9.946631087391594, + "loss": 0.3277, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "grad_norm": 1.4111415147781372, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "learning_rate": 5.975442735404179e-08, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "loss": 0.27478665113449097, + "step": 29820 + }, + { + "ce_loss": 0.033356938511133194, + "epoch": 9.946631087391594, + "step": 29820 + }, + { + "distill_loss": 0.1275174617767334, + "epoch": 9.946631087391594, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "ref_ce_loss": 0.07789397239685059, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "loss": 0.2878682613372803, + "step": 29820 + }, + { + "ce_loss": 0.041646961122751236, + "epoch": 9.946631087391594, + "step": 29820 + }, + { + "distill_loss": 0.16942068934440613, + "epoch": 9.946631087391594, + "step": 29820 + }, + { + "epoch": 9.946631087391594, + "ref_ce_loss": 0.05287106707692146, + "step": 29820 + }, + { + "epoch": 9.949966644429619, + "loss": 0.3172, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "grad_norm": 1.569103479385376, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "learning_rate": 5.251869801248255e-08, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "loss": 0.2843485474586487, + "step": 29830 + }, + { + "ce_loss": 0.02224954031407833, + "epoch": 9.949966644429619, + "step": 29830 + }, + { + "distill_loss": 0.14809651672840118, + "epoch": 9.949966644429619, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "ref_ce_loss": 0.05193183571100235, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "loss": 0.2797556519508362, + "step": 29830 + }, + { + "ce_loss": 0.042071759700775146, + "epoch": 9.949966644429619, + "step": 29830 + }, + { + "distill_loss": 0.1801556646823883, + "epoch": 9.949966644429619, + "step": 29830 + }, + { + "epoch": 9.949966644429619, + "ref_ce_loss": 0.057339444756507874, + "step": 29830 + }, + { + "epoch": 9.953302201467645, + "loss": 0.3358, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "grad_norm": 1.1686184406280518, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "learning_rate": 4.5749750417733994e-08, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "loss": 0.34728512167930603, + "step": 29840 + }, + { + "ce_loss": 0.028365690261125565, + "epoch": 9.953302201467645, + "step": 29840 + }, + { + "distill_loss": 0.2081349641084671, + "epoch": 9.953302201467645, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "ref_ce_loss": 0.07218490540981293, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "loss": 0.23002469539642334, + "step": 29840 + }, + { + "ce_loss": 0.03352981060743332, + "epoch": 9.953302201467645, + "step": 29840 + }, + { + "distill_loss": 0.14592669904232025, + "epoch": 9.953302201467645, + "step": 29840 + }, + { + "epoch": 9.953302201467645, + "ref_ce_loss": 0.03543630242347717, + "step": 29840 + }, + { + "epoch": 9.956637758505671, + "loss": 0.3346, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "grad_norm": 0.8662817478179932, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "learning_rate": 3.944759246992113e-08, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "loss": 0.345037043094635, + "step": 29850 + }, + { + "ce_loss": 0.04612068086862564, + "epoch": 9.956637758505671, + "step": 29850 + }, + { + "distill_loss": 0.19077233970165253, + "epoch": 9.956637758505671, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "ref_ce_loss": 0.056209057569503784, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "loss": 0.3140699565410614, + "step": 29850 + }, + { + "ce_loss": 0.04668711870908737, + "epoch": 9.956637758505671, + "step": 29850 + }, + { + "distill_loss": 0.18919126689434052, + "epoch": 9.956637758505671, + "step": 29850 + }, + { + "epoch": 9.956637758505671, + "ref_ce_loss": 0.05338788405060768, + "step": 29850 + }, + { + "epoch": 9.959973315543696, + "loss": 0.351, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "grad_norm": 0.947829008102417, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "learning_rate": 3.361223152427151e-08, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "loss": 0.21806228160858154, + "step": 29860 + }, + { + "ce_loss": 0.03290942311286926, + "epoch": 9.959973315543696, + "step": 29860 + }, + { + "distill_loss": 0.140326589345932, + "epoch": 9.959973315543696, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "ref_ce_loss": 0.04470537230372429, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "loss": 0.37957972288131714, + "step": 29860 + }, + { + "ce_loss": 0.04370058327913284, + "epoch": 9.959973315543696, + "step": 29860 + }, + { + "distill_loss": 0.18686765432357788, + "epoch": 9.959973315543696, + "step": 29860 + }, + { + "epoch": 9.959973315543696, + "ref_ce_loss": 0.07017166912555695, + "step": 29860 + }, + { + "epoch": 9.96330887258172, + "loss": 0.3735, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "grad_norm": 1.2191232442855835, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "learning_rate": 2.824367439133724e-08, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "loss": 0.3440263867378235, + "step": 29870 + }, + { + "ce_loss": 0.05631376802921295, + "epoch": 9.96330887258172, + "step": 29870 + }, + { + "distill_loss": 0.19631989300251007, + "epoch": 9.96330887258172, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "ref_ce_loss": 0.07191712409257889, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "loss": 0.2361232340335846, + "step": 29870 + }, + { + "ce_loss": 0.023900402709841728, + "epoch": 9.96330887258172, + "step": 29870 + }, + { + "distill_loss": 0.15150588750839233, + "epoch": 9.96330887258172, + "step": 29870 + }, + { + "epoch": 9.96330887258172, + "ref_ce_loss": 0.04112594574689865, + "step": 29870 + }, + { + "epoch": 9.966644429619747, + "loss": 0.3224, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "grad_norm": 0.8823342323303223, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "learning_rate": 2.3341927336772986e-08, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "loss": 0.42281824350357056, + "step": 29880 + }, + { + "ce_loss": 0.03692351654171944, + "epoch": 9.966644429619747, + "step": 29880 + }, + { + "distill_loss": 0.22779785096645355, + "epoch": 9.966644429619747, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "ref_ce_loss": 0.06586943566799164, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "loss": 0.2905508279800415, + "step": 29880 + }, + { + "ce_loss": 0.011483977548778057, + "epoch": 9.966644429619747, + "step": 29880 + }, + { + "distill_loss": 0.18474937975406647, + "epoch": 9.966644429619747, + "step": 29880 + }, + { + "epoch": 9.966644429619747, + "ref_ce_loss": 0.07037250697612762, + "step": 29880 + }, + { + "epoch": 9.969979986657773, + "loss": 0.3249, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "grad_norm": 0.8157691359519958, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "learning_rate": 1.8906996081424765e-08, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "loss": 0.9363209009170532, + "step": 29890 + }, + { + "ce_loss": 0.04010238125920296, + "epoch": 9.969979986657773, + "step": 29890 + }, + { + "distill_loss": 0.12882179021835327, + "epoch": 9.969979986657773, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "ref_ce_loss": 0.042074572294950485, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "loss": 0.20374956727027893, + "step": 29890 + }, + { + "ce_loss": 0.018373459577560425, + "epoch": 9.969979986657773, + "step": 29890 + }, + { + "distill_loss": 0.14011499285697937, + "epoch": 9.969979986657773, + "step": 29890 + }, + { + "epoch": 9.969979986657773, + "ref_ce_loss": 0.030234120786190033, + "step": 29890 + }, + { + "epoch": 9.973315543695797, + "loss": 0.3742, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "grad_norm": 0.848705530166626, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "learning_rate": 1.493888580137437e-08, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "loss": 0.2602086365222931, + "step": 29900 + }, + { + "ce_loss": 0.03535810858011246, + "epoch": 9.973315543695797, + "step": 29900 + }, + { + "distill_loss": 0.14325125515460968, + "epoch": 9.973315543695797, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "ref_ce_loss": 0.053540270775556564, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "loss": 0.22060787677764893, + "step": 29900 + }, + { + "ce_loss": 0.02888793684542179, + "epoch": 9.973315543695797, + "step": 29900 + }, + { + "distill_loss": 0.13869284093379974, + "epoch": 9.973315543695797, + "step": 29900 + }, + { + "epoch": 9.973315543695797, + "ref_ce_loss": 0.0529271699488163, + "step": 29900 + }, + { + "epoch": 9.976651100733822, + "loss": 0.3507, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "grad_norm": 1.8673821687698364, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "learning_rate": 1.1437601127850527e-08, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "loss": 0.5325973033905029, + "step": 29910 + }, + { + "ce_loss": 0.08878213167190552, + "epoch": 9.976651100733822, + "step": 29910 + }, + { + "distill_loss": 0.230632483959198, + "epoch": 9.976651100733822, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "ref_ce_loss": 0.08264369517564774, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "loss": 0.3474941551685333, + "step": 29910 + }, + { + "ce_loss": 0.038783978670835495, + "epoch": 9.976651100733822, + "step": 29910 + }, + { + "distill_loss": 0.18959704041481018, + "epoch": 9.976651100733822, + "step": 29910 + }, + { + "epoch": 9.976651100733822, + "ref_ce_loss": 0.05137256532907486, + "step": 29910 + }, + { + "epoch": 9.979986657771848, + "loss": 0.3746, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "grad_norm": 0.875569224357605, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "learning_rate": 8.403146147140106e-09, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "loss": 0.2730117440223694, + "step": 29920 + }, + { + "ce_loss": 0.03304222598671913, + "epoch": 9.979986657771848, + "step": 29920 + }, + { + "distill_loss": 0.17824025452136993, + "epoch": 9.979986657771848, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "ref_ce_loss": 0.06153935194015503, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "loss": 0.2830379605293274, + "step": 29920 + }, + { + "ce_loss": 0.024901097640395164, + "epoch": 9.979986657771848, + "step": 29920 + }, + { + "distill_loss": 0.16472214460372925, + "epoch": 9.979986657771848, + "step": 29920 + }, + { + "epoch": 9.979986657771848, + "ref_ce_loss": 0.05382291227579117, + "step": 29920 + }, + { + "epoch": 9.983322214809874, + "loss": 0.3348, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "grad_norm": 1.073372721672058, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "learning_rate": 5.835524400854553e-09, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "loss": 0.3343769907951355, + "step": 29930 + }, + { + "ce_loss": 0.04512088745832443, + "epoch": 9.983322214809874, + "step": 29930 + }, + { + "distill_loss": 0.13358773291110992, + "epoch": 9.983322214809874, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "ref_ce_loss": 0.07107693701982498, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "loss": 0.29759302735328674, + "step": 29930 + }, + { + "ce_loss": 0.025530952960252762, + "epoch": 9.983322214809874, + "step": 29930 + }, + { + "distill_loss": 0.16844472289085388, + "epoch": 9.983322214809874, + "step": 29930 + }, + { + "epoch": 9.983322214809874, + "ref_ce_loss": 0.060609109699726105, + "step": 29930 + }, + { + "epoch": 9.986657771847899, + "loss": 0.3355, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "grad_norm": 9.1760835647583, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "learning_rate": 3.7347388857078554e-09, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "loss": 0.29515504837036133, + "step": 29940 + }, + { + "ce_loss": 0.037243470549583435, + "epoch": 9.986657771847899, + "step": 29940 + }, + { + "distill_loss": 0.1859862208366394, + "epoch": 9.986657771847899, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "ref_ce_loss": 0.04966844245791435, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "loss": 0.3100915849208832, + "step": 29940 + }, + { + "ce_loss": 0.039968203753232956, + "epoch": 9.986657771847899, + "step": 29940 + }, + { + "distill_loss": 0.19227924942970276, + "epoch": 9.986657771847899, + "step": 29940 + }, + { + "epoch": 9.986657771847899, + "ref_ce_loss": 0.038896676152944565, + "step": 29940 + }, + { + "epoch": 9.989993328885923, + "loss": 0.37, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "grad_norm": 1.4707958698272705, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "learning_rate": 2.1007920534277248e-09, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "loss": 0.4190083146095276, + "step": 29950 + }, + { + "ce_loss": 0.021757889539003372, + "epoch": 9.989993328885923, + "step": 29950 + }, + { + "distill_loss": 0.17043279111385345, + "epoch": 9.989993328885923, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "ref_ce_loss": 0.0661436915397644, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "loss": 0.4909171462059021, + "step": 29950 + }, + { + "ce_loss": 0.02767323888838291, + "epoch": 9.989993328885923, + "step": 29950 + }, + { + "distill_loss": 0.1564040184020996, + "epoch": 9.989993328885923, + "step": 29950 + }, + { + "epoch": 9.989993328885923, + "ref_ce_loss": 0.06023760512471199, + "step": 29950 + }, + { + "epoch": 9.99332888592395, + "loss": 0.3398, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "grad_norm": 1.0020558834075928, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "learning_rate": 9.336858111552715e-10, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "loss": 0.2585368752479553, + "step": 29960 + }, + { + "ce_loss": 0.032630253583192825, + "epoch": 9.99332888592395, + "step": 29960 + }, + { + "distill_loss": 0.1443522572517395, + "epoch": 9.99332888592395, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "ref_ce_loss": 0.030707091093063354, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "loss": 0.31847187876701355, + "step": 29960 + }, + { + "ce_loss": 0.02757086232304573, + "epoch": 9.99332888592395, + "step": 29960 + }, + { + "distill_loss": 0.18047651648521423, + "epoch": 9.99332888592395, + "step": 29960 + }, + { + "epoch": 9.99332888592395, + "ref_ce_loss": 0.07202086597681046, + "step": 29960 + }, + { + "epoch": 9.996664442961976, + "loss": 0.3786, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "grad_norm": 1.3275409936904907, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "learning_rate": 2.3342152091210265e-10, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "loss": 0.26913413405418396, + "step": 29970 + }, + { + "ce_loss": 0.058668166399002075, + "epoch": 9.996664442961976, + "step": 29970 + }, + { + "distill_loss": 0.15468889474868774, + "epoch": 9.996664442961976, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "ref_ce_loss": 0.039234500378370285, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "loss": 0.5042160749435425, + "step": 29970 + }, + { + "ce_loss": 0.036590516567230225, + "epoch": 9.996664442961976, + "step": 29970 + }, + { + "distill_loss": 0.20055048167705536, + "epoch": 9.996664442961976, + "step": 29970 + }, + { + "epoch": 9.996664442961976, + "ref_ce_loss": 0.051263727247714996, + "step": 29970 + }, + { + "epoch": 10.0, + "loss": 0.3577, + "step": 29980 + }, + { + "epoch": 10.0, + "grad_norm": 1.942399501800537, + "step": 29980 + }, + { + "epoch": 10.0, + "learning_rate": 0.0, + "step": 29980 + }, + { + "epoch": 10.0, + "step": 29980, + "train_runtime": 69684.4226 + }, + { + "epoch": 10.0, + "step": 29980, + "train_samples_per_second": 55.066 + }, + { + "epoch": 10.0, + "step": 29980, + "train_steps_per_second": 0.43 + }, + { + "epoch": 10.0, + "step": 29980, + "total_flos": 0.0 + }, + { + "epoch": 10.0, + "step": 29980, + "train_loss": 0.8395808646009316 + } + ], + "logging_steps": 10, + "max_steps": 29980, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}