{ "best_metric": 0.541167345355827, "best_model_checkpoint": "/content/drive/MyDrive/Inventariado_Mobiliario_Urbano/cars_brand_model/cmmy/checkpoint-2260", "epoch": 9.996692392502757, "eval_steps": 500, "global_step": 2260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.044101433296582136, "grad_norm": 4.8150153160095215, "learning_rate": 4.424778761061947e-05, "loss": 33.4771, "step": 10 }, { "epoch": 0.08820286659316427, "grad_norm": 4.733597278594971, "learning_rate": 8.849557522123894e-05, "loss": 33.1339, "step": 20 }, { "epoch": 0.13230429988974643, "grad_norm": 4.442318439483643, "learning_rate": 0.00013274336283185842, "loss": 32.4884, "step": 30 }, { "epoch": 0.17640573318632854, "grad_norm": 4.178449630737305, "learning_rate": 0.00017699115044247788, "loss": 31.6444, "step": 40 }, { "epoch": 0.2205071664829107, "grad_norm": 4.139791965484619, "learning_rate": 0.00022123893805309737, "loss": 30.7515, "step": 50 }, { "epoch": 0.26460859977949286, "grad_norm": 4.265659332275391, "learning_rate": 0.00026548672566371683, "loss": 29.8171, "step": 60 }, { "epoch": 0.308710033076075, "grad_norm": 4.5766921043396, "learning_rate": 0.00030973451327433627, "loss": 28.6353, "step": 70 }, { "epoch": 0.3528114663726571, "grad_norm": 5.000542640686035, "learning_rate": 0.00035398230088495576, "loss": 27.6647, "step": 80 }, { "epoch": 0.39691289966923926, "grad_norm": 4.588747501373291, "learning_rate": 0.00039823008849557525, "loss": 26.439, "step": 90 }, { "epoch": 0.4410143329658214, "grad_norm": 5.457170486450195, "learning_rate": 0.00044247787610619474, "loss": 25.3025, "step": 100 }, { "epoch": 0.48511576626240355, "grad_norm": 5.326786518096924, "learning_rate": 0.0004867256637168142, "loss": 24.4399, "step": 110 }, { "epoch": 0.5292171995589857, "grad_norm": 5.408795356750488, "learning_rate": 0.0005309734513274337, "loss": 23.267, "step": 120 }, { "epoch": 0.5733186328555678, "grad_norm": 5.888697624206543, "learning_rate": 0.0005752212389380532, "loss": 22.239, "step": 130 }, { "epoch": 0.61742006615215, "grad_norm": 6.699647903442383, "learning_rate": 0.0006194690265486725, "loss": 21.443, "step": 140 }, { "epoch": 0.6615214994487321, "grad_norm": 5.940579891204834, "learning_rate": 0.0006637168141592921, "loss": 20.2496, "step": 150 }, { "epoch": 0.7056229327453142, "grad_norm": 6.729783058166504, "learning_rate": 0.0007079646017699115, "loss": 19.7215, "step": 160 }, { "epoch": 0.7497243660418964, "grad_norm": 7.232001781463623, "learning_rate": 0.0007522123893805309, "loss": 18.8327, "step": 170 }, { "epoch": 0.7938257993384785, "grad_norm": 6.139113426208496, "learning_rate": 0.0007964601769911505, "loss": 18.3153, "step": 180 }, { "epoch": 0.8379272326350606, "grad_norm": 6.233616828918457, "learning_rate": 0.0008407079646017699, "loss": 17.7032, "step": 190 }, { "epoch": 0.8820286659316428, "grad_norm": 6.897170543670654, "learning_rate": 0.0008849557522123895, "loss": 16.9044, "step": 200 }, { "epoch": 0.9261300992282249, "grad_norm": 6.186814785003662, "learning_rate": 0.0009292035398230089, "loss": 16.6313, "step": 210 }, { "epoch": 0.9702315325248071, "grad_norm": 6.237786293029785, "learning_rate": 0.0009734513274336283, "loss": 16.0903, "step": 220 }, { "epoch": 0.9966923925027563, "eval_accuracy": 0.19693620321892574, "eval_loss": 3.4455316066741943, "eval_runtime": 157.5981, "eval_samples_per_second": 163.612, "eval_steps_per_second": 0.641, "step": 226 }, { "epoch": 1.017640573318633, "grad_norm": 5.7300639152526855, "learning_rate": 0.0009980334316617503, "loss": 16.2564, "step": 230 }, { "epoch": 1.061742006615215, "grad_norm": 6.365210056304932, "learning_rate": 0.000993117010816126, "loss": 14.7841, "step": 240 }, { "epoch": 1.1058434399117971, "grad_norm": 6.162064552307129, "learning_rate": 0.0009882005899705015, "loss": 14.7691, "step": 250 }, { "epoch": 1.1499448732083792, "grad_norm": 6.440285682678223, "learning_rate": 0.0009832841691248771, "loss": 14.1738, "step": 260 }, { "epoch": 1.1940463065049614, "grad_norm": 5.520109176635742, "learning_rate": 0.0009783677482792527, "loss": 14.0754, "step": 270 }, { "epoch": 1.2381477398015435, "grad_norm": 6.146816730499268, "learning_rate": 0.0009734513274336283, "loss": 13.9686, "step": 280 }, { "epoch": 1.2822491730981258, "grad_norm": 5.696677207946777, "learning_rate": 0.000968534906588004, "loss": 13.6213, "step": 290 }, { "epoch": 1.326350606394708, "grad_norm": 5.706403732299805, "learning_rate": 0.0009636184857423795, "loss": 13.2106, "step": 300 }, { "epoch": 1.37045203969129, "grad_norm": 6.3477067947387695, "learning_rate": 0.0009587020648967551, "loss": 13.1036, "step": 310 }, { "epoch": 1.4145534729878722, "grad_norm": 6.014346599578857, "learning_rate": 0.0009537856440511307, "loss": 13.0983, "step": 320 }, { "epoch": 1.4586549062844543, "grad_norm": 5.742127895355225, "learning_rate": 0.0009488692232055063, "loss": 12.6127, "step": 330 }, { "epoch": 1.5027563395810364, "grad_norm": 5.869048595428467, "learning_rate": 0.0009439528023598821, "loss": 12.7718, "step": 340 }, { "epoch": 1.5468577728776185, "grad_norm": 5.569489479064941, "learning_rate": 0.0009390363815142577, "loss": 12.4881, "step": 350 }, { "epoch": 1.5909592061742006, "grad_norm": 6.744396686553955, "learning_rate": 0.0009341199606686333, "loss": 12.3667, "step": 360 }, { "epoch": 1.6350606394707827, "grad_norm": 5.721745014190674, "learning_rate": 0.0009292035398230089, "loss": 12.169, "step": 370 }, { "epoch": 1.6791620727673648, "grad_norm": 6.00828742980957, "learning_rate": 0.0009242871189773845, "loss": 12.163, "step": 380 }, { "epoch": 1.723263506063947, "grad_norm": 6.328344345092773, "learning_rate": 0.0009193706981317601, "loss": 11.9877, "step": 390 }, { "epoch": 1.767364939360529, "grad_norm": 5.9762492179870605, "learning_rate": 0.0009144542772861358, "loss": 11.8959, "step": 400 }, { "epoch": 1.8114663726571112, "grad_norm": 5.462543487548828, "learning_rate": 0.0009095378564405114, "loss": 11.8443, "step": 410 }, { "epoch": 1.8555678059536935, "grad_norm": 5.628457069396973, "learning_rate": 0.000904621435594887, "loss": 11.7006, "step": 420 }, { "epoch": 1.8996692392502756, "grad_norm": 5.508877754211426, "learning_rate": 0.0008997050147492626, "loss": 11.6014, "step": 430 }, { "epoch": 1.9437706725468578, "grad_norm": 5.510633945465088, "learning_rate": 0.0008947885939036382, "loss": 11.4442, "step": 440 }, { "epoch": 1.9878721058434399, "grad_norm": 5.567264080047607, "learning_rate": 0.0008898721730580139, "loss": 11.5438, "step": 450 }, { "epoch": 1.9966923925027564, "eval_accuracy": 0.299166181888695, "eval_loss": 2.4033162593841553, "eval_runtime": 157.919, "eval_samples_per_second": 163.28, "eval_steps_per_second": 0.64, "step": 452 }, { "epoch": 2.035281146637266, "grad_norm": 5.653784275054932, "learning_rate": 0.0008849557522123895, "loss": 11.2957, "step": 460 }, { "epoch": 2.079382579933848, "grad_norm": 5.515305519104004, "learning_rate": 0.0008800393313667651, "loss": 10.4382, "step": 470 }, { "epoch": 2.12348401323043, "grad_norm": 5.540599346160889, "learning_rate": 0.0008751229105211407, "loss": 10.2655, "step": 480 }, { "epoch": 2.167585446527012, "grad_norm": 5.588015079498291, "learning_rate": 0.0008702064896755162, "loss": 10.4359, "step": 490 }, { "epoch": 2.2116868798235942, "grad_norm": 5.268083095550537, "learning_rate": 0.0008652900688298918, "loss": 10.4373, "step": 500 }, { "epoch": 2.2557883131201764, "grad_norm": 5.2158427238464355, "learning_rate": 0.0008603736479842675, "loss": 10.3617, "step": 510 }, { "epoch": 2.2998897464167585, "grad_norm": 5.967540264129639, "learning_rate": 0.0008554572271386431, "loss": 10.343, "step": 520 }, { "epoch": 2.3439911797133406, "grad_norm": 6.330170154571533, "learning_rate": 0.0008505408062930187, "loss": 10.3236, "step": 530 }, { "epoch": 2.3880926130099227, "grad_norm": 5.394957065582275, "learning_rate": 0.0008456243854473943, "loss": 10.1741, "step": 540 }, { "epoch": 2.432194046306505, "grad_norm": 5.637348651885986, "learning_rate": 0.0008407079646017699, "loss": 10.1634, "step": 550 }, { "epoch": 2.476295479603087, "grad_norm": 5.205583572387695, "learning_rate": 0.0008357915437561455, "loss": 10.0739, "step": 560 }, { "epoch": 2.5203969128996695, "grad_norm": 5.155258655548096, "learning_rate": 0.0008308751229105212, "loss": 10.1469, "step": 570 }, { "epoch": 2.5644983461962516, "grad_norm": 5.169893741607666, "learning_rate": 0.0008259587020648968, "loss": 10.0088, "step": 580 }, { "epoch": 2.6085997794928337, "grad_norm": 5.327938556671143, "learning_rate": 0.0008210422812192724, "loss": 9.9356, "step": 590 }, { "epoch": 2.652701212789416, "grad_norm": 5.036328315734863, "learning_rate": 0.000816125860373648, "loss": 9.8759, "step": 600 }, { "epoch": 2.696802646085998, "grad_norm": 5.075291633605957, "learning_rate": 0.0008112094395280236, "loss": 9.8351, "step": 610 }, { "epoch": 2.74090407938258, "grad_norm": 5.24298620223999, "learning_rate": 0.0008062930186823992, "loss": 9.6878, "step": 620 }, { "epoch": 2.785005512679162, "grad_norm": 4.871809005737305, "learning_rate": 0.0008013765978367749, "loss": 9.9713, "step": 630 }, { "epoch": 2.8291069459757443, "grad_norm": 4.931336879730225, "learning_rate": 0.0007964601769911505, "loss": 9.8542, "step": 640 }, { "epoch": 2.8732083792723264, "grad_norm": 5.142650127410889, "learning_rate": 0.0007915437561455261, "loss": 9.8176, "step": 650 }, { "epoch": 2.9173098125689085, "grad_norm": 5.100261211395264, "learning_rate": 0.0007866273352999017, "loss": 9.5995, "step": 660 }, { "epoch": 2.9614112458654906, "grad_norm": 4.906562805175781, "learning_rate": 0.0007817109144542773, "loss": 9.5963, "step": 670 }, { "epoch": 2.9966923925027564, "eval_accuracy": 0.3491564863292612, "eval_loss": 2.0761232376098633, "eval_runtime": 157.9952, "eval_samples_per_second": 163.201, "eval_steps_per_second": 0.639, "step": 678 }, { "epoch": 3.0088202866593163, "grad_norm": 4.665123462677002, "learning_rate": 0.000776794493608653, "loss": 9.8294, "step": 680 }, { "epoch": 3.0529217199558984, "grad_norm": 5.142998695373535, "learning_rate": 0.0007718780727630286, "loss": 8.6602, "step": 690 }, { "epoch": 3.0970231532524806, "grad_norm": 4.909497261047363, "learning_rate": 0.0007669616519174042, "loss": 8.5178, "step": 700 }, { "epoch": 3.1411245865490627, "grad_norm": 4.914758682250977, "learning_rate": 0.0007620452310717798, "loss": 8.5225, "step": 710 }, { "epoch": 3.185226019845645, "grad_norm": 4.778907299041748, "learning_rate": 0.0007571288102261554, "loss": 8.6442, "step": 720 }, { "epoch": 3.2293274531422274, "grad_norm": 4.7863240242004395, "learning_rate": 0.0007522123893805309, "loss": 8.6121, "step": 730 }, { "epoch": 3.2734288864388095, "grad_norm": 4.866987228393555, "learning_rate": 0.0007472959685349066, "loss": 8.6773, "step": 740 }, { "epoch": 3.3175303197353916, "grad_norm": 4.968977451324463, "learning_rate": 0.0007423795476892822, "loss": 8.6663, "step": 750 }, { "epoch": 3.3616317530319737, "grad_norm": 4.805993556976318, "learning_rate": 0.0007374631268436578, "loss": 8.6653, "step": 760 }, { "epoch": 3.405733186328556, "grad_norm": 4.871044635772705, "learning_rate": 0.0007325467059980334, "loss": 8.5913, "step": 770 }, { "epoch": 3.449834619625138, "grad_norm": 5.305060863494873, "learning_rate": 0.000727630285152409, "loss": 8.4683, "step": 780 }, { "epoch": 3.49393605292172, "grad_norm": 4.906181812286377, "learning_rate": 0.0007227138643067846, "loss": 8.5715, "step": 790 }, { "epoch": 3.538037486218302, "grad_norm": 4.955236434936523, "learning_rate": 0.0007177974434611603, "loss": 8.5461, "step": 800 }, { "epoch": 3.5821389195148843, "grad_norm": 4.7065863609313965, "learning_rate": 0.0007128810226155359, "loss": 8.5517, "step": 810 }, { "epoch": 3.6262403528114664, "grad_norm": 4.8048930168151855, "learning_rate": 0.0007079646017699115, "loss": 8.6689, "step": 820 }, { "epoch": 3.6703417861080485, "grad_norm": 5.108726978302002, "learning_rate": 0.0007030481809242871, "loss": 8.4989, "step": 830 }, { "epoch": 3.7144432194046306, "grad_norm": 4.784049034118652, "learning_rate": 0.0006981317600786627, "loss": 8.5436, "step": 840 }, { "epoch": 3.7585446527012127, "grad_norm": 4.953752517700195, "learning_rate": 0.0006932153392330383, "loss": 8.3707, "step": 850 }, { "epoch": 3.802646085997795, "grad_norm": 4.866833686828613, "learning_rate": 0.000688298918387414, "loss": 8.5086, "step": 860 }, { "epoch": 3.846747519294377, "grad_norm": 4.966568946838379, "learning_rate": 0.0006833824975417896, "loss": 8.3376, "step": 870 }, { "epoch": 3.890848952590959, "grad_norm": 4.906131744384766, "learning_rate": 0.0006784660766961652, "loss": 8.3634, "step": 880 }, { "epoch": 3.934950385887541, "grad_norm": 4.776595115661621, "learning_rate": 0.0006735496558505408, "loss": 8.3092, "step": 890 }, { "epoch": 3.9790518191841233, "grad_norm": 4.59867525100708, "learning_rate": 0.0006686332350049164, "loss": 8.3873, "step": 900 }, { "epoch": 3.9966923925027564, "eval_accuracy": 0.3934458018227652, "eval_loss": 1.892139196395874, "eval_runtime": 157.998, "eval_samples_per_second": 163.198, "eval_steps_per_second": 0.639, "step": 904 }, { "epoch": 4.0264608599779494, "grad_norm": 4.579000473022461, "learning_rate": 0.0006637168141592921, "loss": 8.0129, "step": 910 }, { "epoch": 4.070562293274532, "grad_norm": 4.628970623016357, "learning_rate": 0.0006588003933136677, "loss": 7.3258, "step": 920 }, { "epoch": 4.114663726571114, "grad_norm": 4.4933953285217285, "learning_rate": 0.0006538839724680433, "loss": 7.3052, "step": 930 }, { "epoch": 4.158765159867696, "grad_norm": 4.572768211364746, "learning_rate": 0.0006489675516224189, "loss": 7.3455, "step": 940 }, { "epoch": 4.202866593164278, "grad_norm": 4.468959331512451, "learning_rate": 0.0006440511307767945, "loss": 7.3741, "step": 950 }, { "epoch": 4.24696802646086, "grad_norm": 4.594937324523926, "learning_rate": 0.00063913470993117, "loss": 7.3599, "step": 960 }, { "epoch": 4.291069459757442, "grad_norm": 4.6465935707092285, "learning_rate": 0.0006342182890855457, "loss": 7.3735, "step": 970 }, { "epoch": 4.335170893054024, "grad_norm": 4.683464527130127, "learning_rate": 0.0006293018682399213, "loss": 7.447, "step": 980 }, { "epoch": 4.379272326350606, "grad_norm": 4.466670989990234, "learning_rate": 0.0006243854473942969, "loss": 7.4649, "step": 990 }, { "epoch": 4.4233737596471885, "grad_norm": 4.653200149536133, "learning_rate": 0.0006194690265486725, "loss": 7.3462, "step": 1000 }, { "epoch": 4.467475192943771, "grad_norm": 4.518767356872559, "learning_rate": 0.0006145526057030481, "loss": 7.2768, "step": 1010 }, { "epoch": 4.511576626240353, "grad_norm": 4.512480735778809, "learning_rate": 0.0006096361848574237, "loss": 7.306, "step": 1020 }, { "epoch": 4.555678059536935, "grad_norm": 4.512141704559326, "learning_rate": 0.0006047197640117994, "loss": 7.3665, "step": 1030 }, { "epoch": 4.599779492833517, "grad_norm": 4.769530773162842, "learning_rate": 0.000599803343166175, "loss": 7.4248, "step": 1040 }, { "epoch": 4.643880926130099, "grad_norm": 4.538891792297363, "learning_rate": 0.0005948869223205506, "loss": 7.3548, "step": 1050 }, { "epoch": 4.687982359426681, "grad_norm": 4.579529762268066, "learning_rate": 0.0005899705014749262, "loss": 7.1825, "step": 1060 }, { "epoch": 4.732083792723263, "grad_norm": 4.407522201538086, "learning_rate": 0.0005850540806293018, "loss": 7.3086, "step": 1070 }, { "epoch": 4.776185226019845, "grad_norm": 4.507376194000244, "learning_rate": 0.0005801376597836774, "loss": 7.4489, "step": 1080 }, { "epoch": 4.8202866593164275, "grad_norm": 4.580904960632324, "learning_rate": 0.0005752212389380532, "loss": 7.2829, "step": 1090 }, { "epoch": 4.86438809261301, "grad_norm": 4.334172248840332, "learning_rate": 0.0005703048180924288, "loss": 7.3591, "step": 1100 }, { "epoch": 4.908489525909592, "grad_norm": 4.6015095710754395, "learning_rate": 0.0005653883972468044, "loss": 7.3586, "step": 1110 }, { "epoch": 4.952590959206174, "grad_norm": 4.470212936401367, "learning_rate": 0.00056047197640118, "loss": 7.3276, "step": 1120 }, { "epoch": 4.996692392502756, "grad_norm": 4.509373664855957, "learning_rate": 0.0005555555555555556, "loss": 7.3127, "step": 1130 }, { "epoch": 4.996692392502756, "eval_accuracy": 0.42877642039945707, "eval_loss": 1.7534652948379517, "eval_runtime": 157.6275, "eval_samples_per_second": 163.582, "eval_steps_per_second": 0.641, "step": 1130 }, { "epoch": 5.044101433296582, "grad_norm": 4.370789527893066, "learning_rate": 0.0005506391347099313, "loss": 6.6219, "step": 1140 }, { "epoch": 5.088202866593164, "grad_norm": 4.587760925292969, "learning_rate": 0.0005457227138643069, "loss": 6.382, "step": 1150 }, { "epoch": 5.132304299889746, "grad_norm": 4.437044143676758, "learning_rate": 0.0005408062930186825, "loss": 6.3493, "step": 1160 }, { "epoch": 5.1764057331863285, "grad_norm": 4.376111030578613, "learning_rate": 0.0005358898721730581, "loss": 6.3537, "step": 1170 }, { "epoch": 5.220507166482911, "grad_norm": 4.685988903045654, "learning_rate": 0.0005309734513274337, "loss": 6.3465, "step": 1180 }, { "epoch": 5.264608599779493, "grad_norm": 4.62017297744751, "learning_rate": 0.0005260570304818093, "loss": 6.3306, "step": 1190 }, { "epoch": 5.308710033076075, "grad_norm": 4.6863112449646, "learning_rate": 0.0005211406096361849, "loss": 6.4408, "step": 1200 }, { "epoch": 5.352811466372657, "grad_norm": 4.470117092132568, "learning_rate": 0.0005162241887905605, "loss": 6.4383, "step": 1210 }, { "epoch": 5.396912899669239, "grad_norm": 4.46852445602417, "learning_rate": 0.0005113077679449361, "loss": 6.2687, "step": 1220 }, { "epoch": 5.441014332965821, "grad_norm": 4.40092134475708, "learning_rate": 0.0005063913470993117, "loss": 6.3344, "step": 1230 }, { "epoch": 5.485115766262403, "grad_norm": 4.5052595138549805, "learning_rate": 0.0005014749262536873, "loss": 6.3289, "step": 1240 }, { "epoch": 5.529217199558985, "grad_norm": 4.2100419998168945, "learning_rate": 0.000496558505408063, "loss": 6.4074, "step": 1250 }, { "epoch": 5.5733186328555675, "grad_norm": 4.383902549743652, "learning_rate": 0.0004916420845624386, "loss": 6.3748, "step": 1260 }, { "epoch": 5.61742006615215, "grad_norm": 4.348648548126221, "learning_rate": 0.0004867256637168142, "loss": 6.2771, "step": 1270 }, { "epoch": 5.661521499448732, "grad_norm": 4.597408294677734, "learning_rate": 0.0004818092428711898, "loss": 6.4728, "step": 1280 }, { "epoch": 5.705622932745314, "grad_norm": 4.483145236968994, "learning_rate": 0.0004768928220255654, "loss": 6.3158, "step": 1290 }, { "epoch": 5.749724366041896, "grad_norm": 4.486307144165039, "learning_rate": 0.00047197640117994103, "loss": 6.3443, "step": 1300 }, { "epoch": 5.793825799338478, "grad_norm": 4.527980804443359, "learning_rate": 0.00046705998033431663, "loss": 6.306, "step": 1310 }, { "epoch": 5.83792723263506, "grad_norm": 4.423805236816406, "learning_rate": 0.00046214355948869223, "loss": 6.2912, "step": 1320 }, { "epoch": 5.882028665931642, "grad_norm": 4.3704752922058105, "learning_rate": 0.0004572271386430679, "loss": 6.2006, "step": 1330 }, { "epoch": 5.926130099228224, "grad_norm": 4.610422611236572, "learning_rate": 0.0004523107177974435, "loss": 6.2838, "step": 1340 }, { "epoch": 5.970231532524807, "grad_norm": 4.506854057312012, "learning_rate": 0.0004473942969518191, "loss": 6.2178, "step": 1350 }, { "epoch": 5.996692392502756, "eval_accuracy": 0.4608881132441342, "eval_loss": 1.6532503366470337, "eval_runtime": 157.0061, "eval_samples_per_second": 164.229, "eval_steps_per_second": 0.643, "step": 1356 }, { "epoch": 6.017640573318633, "grad_norm": 4.212904930114746, "learning_rate": 0.00044247787610619474, "loss": 6.1728, "step": 1360 }, { "epoch": 6.061742006615215, "grad_norm": 4.3509931564331055, "learning_rate": 0.00043756145526057034, "loss": 5.4675, "step": 1370 }, { "epoch": 6.105843439911797, "grad_norm": 4.346857070922852, "learning_rate": 0.0004326450344149459, "loss": 5.4391, "step": 1380 }, { "epoch": 6.149944873208379, "grad_norm": 4.269370079040527, "learning_rate": 0.00042772861356932154, "loss": 5.6166, "step": 1390 }, { "epoch": 6.194046306504961, "grad_norm": 4.528676986694336, "learning_rate": 0.00042281219272369714, "loss": 5.4683, "step": 1400 }, { "epoch": 6.238147739801543, "grad_norm": 4.301924705505371, "learning_rate": 0.00041789577187807274, "loss": 5.6475, "step": 1410 }, { "epoch": 6.282249173098125, "grad_norm": 4.408478736877441, "learning_rate": 0.0004129793510324484, "loss": 5.5012, "step": 1420 }, { "epoch": 6.3263506063947075, "grad_norm": 4.205230712890625, "learning_rate": 0.000408062930186824, "loss": 5.4751, "step": 1430 }, { "epoch": 6.37045203969129, "grad_norm": 4.417777061462402, "learning_rate": 0.0004031465093411996, "loss": 5.4574, "step": 1440 }, { "epoch": 6.414553472987872, "grad_norm": 4.562683582305908, "learning_rate": 0.00039823008849557525, "loss": 5.5681, "step": 1450 }, { "epoch": 6.458654906284455, "grad_norm": 4.4911112785339355, "learning_rate": 0.00039331366764995085, "loss": 5.5747, "step": 1460 }, { "epoch": 6.502756339581037, "grad_norm": 4.3542890548706055, "learning_rate": 0.0003883972468043265, "loss": 5.4824, "step": 1470 }, { "epoch": 6.546857772877619, "grad_norm": 4.291258811950684, "learning_rate": 0.0003834808259587021, "loss": 5.3834, "step": 1480 }, { "epoch": 6.590959206174201, "grad_norm": 4.43005895614624, "learning_rate": 0.0003785644051130777, "loss": 5.4797, "step": 1490 }, { "epoch": 6.635060639470783, "grad_norm": 4.461911201477051, "learning_rate": 0.0003736479842674533, "loss": 5.5312, "step": 1500 }, { "epoch": 6.679162072767365, "grad_norm": 4.246000289916992, "learning_rate": 0.0003687315634218289, "loss": 5.5821, "step": 1510 }, { "epoch": 6.723263506063947, "grad_norm": 4.318939685821533, "learning_rate": 0.0003638151425762045, "loss": 5.5103, "step": 1520 }, { "epoch": 6.7673649393605295, "grad_norm": 4.241775989532471, "learning_rate": 0.00035889872173058016, "loss": 5.4244, "step": 1530 }, { "epoch": 6.811466372657112, "grad_norm": 4.246124267578125, "learning_rate": 0.00035398230088495576, "loss": 5.4415, "step": 1540 }, { "epoch": 6.855567805953694, "grad_norm": 4.474425792694092, "learning_rate": 0.00034906588003933136, "loss": 5.4259, "step": 1550 }, { "epoch": 6.899669239250276, "grad_norm": 4.427676677703857, "learning_rate": 0.000344149459193707, "loss": 5.4803, "step": 1560 }, { "epoch": 6.943770672546858, "grad_norm": 4.402575969696045, "learning_rate": 0.0003392330383480826, "loss": 5.3709, "step": 1570 }, { "epoch": 6.98787210584344, "grad_norm": 4.089934349060059, "learning_rate": 0.0003343166175024582, "loss": 5.4619, "step": 1580 }, { "epoch": 6.996692392502756, "eval_accuracy": 0.4855923986814039, "eval_loss": 1.5859359502792358, "eval_runtime": 157.4699, "eval_samples_per_second": 163.746, "eval_steps_per_second": 0.641, "step": 1582 }, { "epoch": 7.035281146637265, "grad_norm": 4.048954963684082, "learning_rate": 0.00032940019665683387, "loss": 4.9956, "step": 1590 }, { "epoch": 7.0793825799338475, "grad_norm": 4.142811298370361, "learning_rate": 0.00032448377581120947, "loss": 4.637, "step": 1600 }, { "epoch": 7.12348401323043, "grad_norm": 4.157500267028809, "learning_rate": 0.000319567354965585, "loss": 4.7621, "step": 1610 }, { "epoch": 7.167585446527012, "grad_norm": 4.244820594787598, "learning_rate": 0.00031465093411996067, "loss": 4.671, "step": 1620 }, { "epoch": 7.211686879823595, "grad_norm": 4.291553497314453, "learning_rate": 0.00030973451327433627, "loss": 4.7641, "step": 1630 }, { "epoch": 7.255788313120177, "grad_norm": 4.3009233474731445, "learning_rate": 0.00030481809242871187, "loss": 4.7801, "step": 1640 }, { "epoch": 7.299889746416759, "grad_norm": 4.151345729827881, "learning_rate": 0.0002999016715830875, "loss": 4.7008, "step": 1650 }, { "epoch": 7.343991179713341, "grad_norm": 4.11318826675415, "learning_rate": 0.0002949852507374631, "loss": 4.5749, "step": 1660 }, { "epoch": 7.388092613009923, "grad_norm": 4.272958755493164, "learning_rate": 0.0002900688298918387, "loss": 4.7279, "step": 1670 }, { "epoch": 7.432194046306505, "grad_norm": 4.370232105255127, "learning_rate": 0.0002851524090462144, "loss": 4.6739, "step": 1680 }, { "epoch": 7.476295479603087, "grad_norm": 4.224992752075195, "learning_rate": 0.00028023598820059, "loss": 4.6791, "step": 1690 }, { "epoch": 7.5203969128996695, "grad_norm": 4.131043434143066, "learning_rate": 0.00027531956735496563, "loss": 4.742, "step": 1700 }, { "epoch": 7.564498346196252, "grad_norm": 4.325319290161133, "learning_rate": 0.00027040314650934123, "loss": 4.6848, "step": 1710 }, { "epoch": 7.608599779492834, "grad_norm": 4.375499248504639, "learning_rate": 0.00026548672566371683, "loss": 4.7289, "step": 1720 }, { "epoch": 7.652701212789416, "grad_norm": 4.138423442840576, "learning_rate": 0.00026057030481809243, "loss": 4.7507, "step": 1730 }, { "epoch": 7.696802646085998, "grad_norm": 4.30584716796875, "learning_rate": 0.00025565388397246803, "loss": 4.7498, "step": 1740 }, { "epoch": 7.74090407938258, "grad_norm": 4.196134567260742, "learning_rate": 0.00025073746312684363, "loss": 4.6912, "step": 1750 }, { "epoch": 7.785005512679162, "grad_norm": 4.179983615875244, "learning_rate": 0.0002458210422812193, "loss": 4.6327, "step": 1760 }, { "epoch": 7.829106945975744, "grad_norm": 4.482637405395508, "learning_rate": 0.0002409046214355949, "loss": 4.7178, "step": 1770 }, { "epoch": 7.873208379272326, "grad_norm": 4.053654670715332, "learning_rate": 0.00023598820058997051, "loss": 4.6353, "step": 1780 }, { "epoch": 7.9173098125689085, "grad_norm": 4.109724998474121, "learning_rate": 0.00023107177974434611, "loss": 4.6531, "step": 1790 }, { "epoch": 7.961411245865491, "grad_norm": 4.190576553344727, "learning_rate": 0.00022615535889872174, "loss": 4.618, "step": 1800 }, { "epoch": 7.996692392502756, "eval_accuracy": 0.5128950940469265, "eval_loss": 1.52533757686615, "eval_runtime": 159.0581, "eval_samples_per_second": 162.111, "eval_steps_per_second": 0.635, "step": 1808 }, { "epoch": 8.008820286659317, "grad_norm": 3.9392812252044678, "learning_rate": 0.00022123893805309737, "loss": 4.6935, "step": 1810 }, { "epoch": 8.052921719955899, "grad_norm": 3.8019185066223145, "learning_rate": 0.00021632251720747294, "loss": 3.9004, "step": 1820 }, { "epoch": 8.097023153252481, "grad_norm": 3.8372962474823, "learning_rate": 0.00021140609636184857, "loss": 4.0764, "step": 1830 }, { "epoch": 8.141124586549063, "grad_norm": 4.013173580169678, "learning_rate": 0.0002064896755162242, "loss": 3.921, "step": 1840 }, { "epoch": 8.185226019845645, "grad_norm": 4.06724739074707, "learning_rate": 0.0002015732546705998, "loss": 4.1819, "step": 1850 }, { "epoch": 8.229327453142227, "grad_norm": 3.967484474182129, "learning_rate": 0.00019665683382497542, "loss": 4.1189, "step": 1860 }, { "epoch": 8.27342888643881, "grad_norm": 3.8714957237243652, "learning_rate": 0.00019174041297935105, "loss": 4.0143, "step": 1870 }, { "epoch": 8.317530319735392, "grad_norm": 3.9495551586151123, "learning_rate": 0.00018682399213372665, "loss": 4.1982, "step": 1880 }, { "epoch": 8.361631753031974, "grad_norm": 4.08094596862793, "learning_rate": 0.00018190757128810225, "loss": 4.0732, "step": 1890 }, { "epoch": 8.405733186328556, "grad_norm": 3.997749090194702, "learning_rate": 0.00017699115044247788, "loss": 4.0657, "step": 1900 }, { "epoch": 8.449834619625138, "grad_norm": 3.9958107471466064, "learning_rate": 0.0001720747295968535, "loss": 4.0651, "step": 1910 }, { "epoch": 8.49393605292172, "grad_norm": 4.039346218109131, "learning_rate": 0.0001671583087512291, "loss": 4.0022, "step": 1920 }, { "epoch": 8.538037486218302, "grad_norm": 4.07537317276001, "learning_rate": 0.00016224188790560473, "loss": 4.0364, "step": 1930 }, { "epoch": 8.582138919514884, "grad_norm": 3.949800491333008, "learning_rate": 0.00015732546705998033, "loss": 4.0273, "step": 1940 }, { "epoch": 8.626240352811466, "grad_norm": 3.9785361289978027, "learning_rate": 0.00015240904621435593, "loss": 4.1008, "step": 1950 }, { "epoch": 8.670341786108049, "grad_norm": 4.0533318519592285, "learning_rate": 0.00014749262536873156, "loss": 3.9856, "step": 1960 }, { "epoch": 8.71444321940463, "grad_norm": 3.9734625816345215, "learning_rate": 0.0001425762045231072, "loss": 3.9327, "step": 1970 }, { "epoch": 8.758544652701213, "grad_norm": 4.144639015197754, "learning_rate": 0.00013765978367748282, "loss": 4.1105, "step": 1980 }, { "epoch": 8.802646085997795, "grad_norm": 4.224411487579346, "learning_rate": 0.00013274336283185842, "loss": 4.0302, "step": 1990 }, { "epoch": 8.846747519294377, "grad_norm": 4.210290908813477, "learning_rate": 0.00012782694198623402, "loss": 3.9702, "step": 2000 }, { "epoch": 8.890848952590959, "grad_norm": 3.948779821395874, "learning_rate": 0.00012291052114060964, "loss": 4.0179, "step": 2010 }, { "epoch": 8.934950385887541, "grad_norm": 4.020263671875, "learning_rate": 0.00011799410029498526, "loss": 3.9426, "step": 2020 }, { "epoch": 8.979051819184123, "grad_norm": 3.9727530479431152, "learning_rate": 0.00011307767944936087, "loss": 3.9349, "step": 2030 }, { "epoch": 8.996692392502757, "eval_accuracy": 0.5314717859220477, "eval_loss": 1.4693009853363037, "eval_runtime": 158.202, "eval_samples_per_second": 162.988, "eval_steps_per_second": 0.638, "step": 2034 }, { "epoch": 9.026460859977949, "grad_norm": 4.023420333862305, "learning_rate": 0.00010816125860373647, "loss": 4.0029, "step": 2040 }, { "epoch": 9.07056229327453, "grad_norm": 3.8481717109680176, "learning_rate": 0.0001032448377581121, "loss": 3.5067, "step": 2050 }, { "epoch": 9.114663726571113, "grad_norm": 3.807828187942505, "learning_rate": 9.832841691248771e-05, "loss": 3.5354, "step": 2060 }, { "epoch": 9.158765159867695, "grad_norm": 3.808962106704712, "learning_rate": 9.341199606686333e-05, "loss": 3.6032, "step": 2070 }, { "epoch": 9.202866593164277, "grad_norm": 3.6250662803649902, "learning_rate": 8.849557522123894e-05, "loss": 3.5225, "step": 2080 }, { "epoch": 9.24696802646086, "grad_norm": 3.9860610961914062, "learning_rate": 8.357915437561455e-05, "loss": 3.6428, "step": 2090 }, { "epoch": 9.291069459757441, "grad_norm": 3.9992544651031494, "learning_rate": 7.866273352999017e-05, "loss": 3.4771, "step": 2100 }, { "epoch": 9.335170893054023, "grad_norm": 3.782989740371704, "learning_rate": 7.374631268436578e-05, "loss": 3.506, "step": 2110 }, { "epoch": 9.379272326350605, "grad_norm": 3.9225122928619385, "learning_rate": 6.882989183874141e-05, "loss": 3.4989, "step": 2120 }, { "epoch": 9.42337375964719, "grad_norm": 3.853450298309326, "learning_rate": 6.391347099311701e-05, "loss": 3.49, "step": 2130 }, { "epoch": 9.467475192943771, "grad_norm": 3.887131452560425, "learning_rate": 5.899705014749263e-05, "loss": 3.5517, "step": 2140 }, { "epoch": 9.511576626240354, "grad_norm": 3.7988171577453613, "learning_rate": 5.4080629301868236e-05, "loss": 3.4626, "step": 2150 }, { "epoch": 9.555678059536936, "grad_norm": 3.80769419670105, "learning_rate": 4.9164208456243856e-05, "loss": 3.5627, "step": 2160 }, { "epoch": 9.599779492833518, "grad_norm": 3.883881092071533, "learning_rate": 4.424778761061947e-05, "loss": 3.5904, "step": 2170 }, { "epoch": 9.6438809261301, "grad_norm": 3.729762554168701, "learning_rate": 3.9331366764995083e-05, "loss": 3.5366, "step": 2180 }, { "epoch": 9.687982359426682, "grad_norm": 3.7784054279327393, "learning_rate": 3.4414945919370704e-05, "loss": 3.5186, "step": 2190 }, { "epoch": 9.732083792723264, "grad_norm": 3.8702545166015625, "learning_rate": 2.9498525073746314e-05, "loss": 3.385, "step": 2200 }, { "epoch": 9.776185226019846, "grad_norm": 3.82987380027771, "learning_rate": 2.4582104228121928e-05, "loss": 3.4502, "step": 2210 }, { "epoch": 9.820286659316428, "grad_norm": 3.7813146114349365, "learning_rate": 1.9665683382497542e-05, "loss": 3.4203, "step": 2220 }, { "epoch": 9.86438809261301, "grad_norm": 3.8914685249328613, "learning_rate": 1.4749262536873157e-05, "loss": 3.5059, "step": 2230 }, { "epoch": 9.908489525909593, "grad_norm": 3.8533997535705566, "learning_rate": 9.832841691248771e-06, "loss": 3.483, "step": 2240 }, { "epoch": 9.952590959206175, "grad_norm": 3.7153797149658203, "learning_rate": 4.9164208456243854e-06, "loss": 3.5258, "step": 2250 }, { "epoch": 9.996692392502757, "grad_norm": 3.670651435852051, "learning_rate": 0.0, "loss": 3.4667, "step": 2260 }, { "epoch": 9.996692392502757, "eval_accuracy": 0.541167345355827, "eval_loss": 1.4404356479644775, "eval_runtime": 159.5598, "eval_samples_per_second": 161.601, "eval_steps_per_second": 0.633, "step": 2260 }, { "epoch": 9.996692392502757, "step": 2260, "total_flos": 1.8565430446260014e+20, "train_loss": 8.705859319298668, "train_runtime": 25889.9118, "train_samples_per_second": 89.633, "train_steps_per_second": 0.087 }, { "epoch": 9.996692392502757, "step": 2260, "total_flos": 1.8565430446260014e+20, "train_loss": 0.0, "train_runtime": 9.3339, "train_samples_per_second": 248619.036, "train_steps_per_second": 242.129 } ], "logging_steps": 10, "max_steps": 2260, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8565430446260014e+20, "train_batch_size": 256, "trial_name": null, "trial_params": null }