{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:44:54.030340', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 513.7015351300394, 'timestamp': '2025-09-04 03:44:54.032582', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:44:54.153228', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.22628124058246613, 'timestamp': '2025-09-04 03:44:54.166141', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:44:54.277663', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.18759363889694214, 'timestamp': '2025-09-04 03:44:54.296326', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:44:54.403899', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.21276673674583435, 'timestamp': '2025-09-04 03:44:54.422364', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:44:54.529920', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.20896945893764496, 'timestamp': '2025-09-04 03:44:54.584227', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:44:54.682359', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.15049788355827332, 'timestamp': '2025-09-04 03:44:54.701147', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:44:54.795660', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.221171572804451, 'timestamp': '2025-09-04 03:44:54.812689', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:44:54.925572', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.13799867033958435, 'timestamp': '2025-09-04 03:44:54.945627', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:44:55.048805', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.254111111164093, 'timestamp': '2025-09-04 03:44:55.068403', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:44:55.176185', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.2516261041164398, 'timestamp': '2025-09-04 03:44:55.197753', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:44:55.292267', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.11826080828905106, 'timestamp': '2025-09-04 03:44:55.309480', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:44:55.392164', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.3149326741695404, 'timestamp': '2025-09-04 03:44:55.404837', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:44:55.501241', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.1865367889404297, 'timestamp': '2025-09-04 03:44:55.519423', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:44:55.618360', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.22903558611869812, 'timestamp': '2025-09-04 03:44:55.638547', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:44:55.727492', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.1957457810640335, 'timestamp': '2025-09-04 03:44:55.743023', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:44:55.850708', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.26458948850631714, 'timestamp': '2025-09-04 03:44:55.870711', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:44:55.956573', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.19251839816570282, 'timestamp': '2025-09-04 03:44:55.972768', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:44:56.065326', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.169407457113266, 'timestamp': '2025-09-04 03:44:56.084337', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:44:56.164607', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.1738697588443756, 'timestamp': '2025-09-04 03:44:56.178386', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:44:56.250786', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.12069570273160934, 'timestamp': '2025-09-04 03:44:56.263511', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:44:56.374549', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.28613629937171936, 'timestamp': '2025-09-04 03:44:56.395709', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:04.776963', 'step': 20, 'epoch': 1} {'type': 'pplx', 'content': 459.69697261034077, 'timestamp': '2025-09-04 03:45:04.779321', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:04.883567', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.0930766835808754, 'timestamp': '2025-09-04 03:45:04.905592', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1392], 'flops': 27840169073088.0}, 'timestamp': '2025-09-04 03:45:05.111656', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.16422075033187866, 'timestamp': '2025-09-04 03:45:05.150856', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:45:05.262285', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.15494759380817413, 'timestamp': '2025-09-04 03:45:05.282894', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 03:45:05.496961', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.1941477507352829, 'timestamp': '2025-09-04 03:45:05.538559', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:05.640198', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.046016380190849304, 'timestamp': '2025-09-04 03:45:05.661299', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:05.770082', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.26754552125930786, 'timestamp': '2025-09-04 03:45:05.789791', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:05.887422', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.2157849818468094, 'timestamp': '2025-09-04 03:45:05.903902', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:06.003259', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.09500601142644882, 'timestamp': '2025-09-04 03:45:06.022367', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:06.127436', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.2806503176689148, 'timestamp': '2025-09-04 03:45:06.149677', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:45:06.261111', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.147545725107193, 'timestamp': '2025-09-04 03:45:06.281302', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:06.383742', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.0946376770734787, 'timestamp': '2025-09-04 03:45:06.402770', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:06.498965', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.07105350494384766, 'timestamp': '2025-09-04 03:45:06.516556', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:45:06.595117', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.05865024775266647, 'timestamp': '2025-09-04 03:45:06.609671', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:06.700887', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.04186881706118584, 'timestamp': '2025-09-04 03:45:06.717379', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:06.820494', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.1130460873246193, 'timestamp': '2025-09-04 03:45:06.839402', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:06.935561', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.08032827824354172, 'timestamp': '2025-09-04 03:45:06.953667', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:07.060582', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.10274317115545273, 'timestamp': '2025-09-04 03:45:07.082672', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:07.192812', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.05926857143640518, 'timestamp': '2025-09-04 03:45:07.212899', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:45:07.301385', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.09572148323059082, 'timestamp': '2025-09-04 03:45:07.316491', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:45:07.394036', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.06614067405462265, 'timestamp': '2025-09-04 03:45:07.408562', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:15.977735', 'step': 40, 'epoch': 1} {'type': 'pplx', 'content': 389.98537419869405, 'timestamp': '2025-09-04 03:45:15.980248', 'step': 40, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 40', 'timestamp': '2025-09-04 03:45:16.522945', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:45:16.607576', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.0893501564860344, 'timestamp': '2025-09-04 03:45:16.624424', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:45:16.725320', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.10664358735084534, 'timestamp': '2025-09-04 03:45:16.743948', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:16.850040', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.11798786371946335, 'timestamp': '2025-09-04 03:45:16.869875', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:45:16.959614', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.09886734187602997, 'timestamp': '2025-09-04 03:45:16.975202', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:45:17.049375', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.040978167206048965, 'timestamp': '2025-09-04 03:45:17.064184', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:17.173784', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.06903672963380814, 'timestamp': '2025-09-04 03:45:17.193812', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:17.296985', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.02300487831234932, 'timestamp': '2025-09-04 03:45:17.316264', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:17.408720', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.0908779725432396, 'timestamp': '2025-09-04 03:45:17.426033', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:17.523711', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.08729575574398041, 'timestamp': '2025-09-04 03:45:17.543919', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:17.637324', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.0870480164885521, 'timestamp': '2025-09-04 03:45:17.654175', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:17.756185', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.050713177770376205, 'timestamp': '2025-09-04 03:45:17.775418', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:45:17.857774', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.04926443472504616, 'timestamp': '2025-09-04 03:45:17.873580', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:17.976932', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.014587458223104477, 'timestamp': '2025-09-04 03:45:17.998752', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:18.096464', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.0563230998814106, 'timestamp': '2025-09-04 03:45:18.113944', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:18.204053', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.052430395036935806, 'timestamp': '2025-09-04 03:45:18.220905', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:45:18.299562', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.08462968468666077, 'timestamp': '2025-09-04 03:45:18.314267', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:45:18.410745', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.06088142469525337, 'timestamp': '2025-09-04 03:45:18.431360', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:18.531885', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.10583172738552094, 'timestamp': '2025-09-04 03:45:18.550360', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:18.645711', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.05197317525744438, 'timestamp': '2025-09-04 03:45:18.663022', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:18.767990', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.0938020795583725, 'timestamp': '2025-09-04 03:45:18.788566', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:27.161836', 'step': 60, 'epoch': 1} {'type': 'pplx', 'content': 344.9719250300418, 'timestamp': '2025-09-04 03:45:27.164144', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:27.256830', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.047754935920238495, 'timestamp': '2025-09-04 03:45:27.275797', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:45:27.384552', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.08348787575960159, 'timestamp': '2025-09-04 03:45:27.404855', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:27.496674', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.048718009144067764, 'timestamp': '2025-09-04 03:45:27.513341', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 03:45:27.732444', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.06368561834096909, 'timestamp': '2025-09-04 03:45:27.775457', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:45:27.866602', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.047636643052101135, 'timestamp': '2025-09-04 03:45:27.885576', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:27.994542', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.09618725627660751, 'timestamp': '2025-09-04 03:45:28.014601', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 03:45:28.146160', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.09339220821857452, 'timestamp': '2025-09-04 03:45:28.169472', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:28.276347', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.05055764317512512, 'timestamp': '2025-09-04 03:45:28.296999', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:45:28.379869', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.06613060086965561, 'timestamp': '2025-09-04 03:45:28.396302', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:28.498772', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.08002032339572906, 'timestamp': '2025-09-04 03:45:28.517928', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:45:28.595425', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.09714193642139435, 'timestamp': '2025-09-04 03:45:28.609410', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:28.716648', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.048158854246139526, 'timestamp': '2025-09-04 03:45:28.737622', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:28.833604', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.17001864314079285, 'timestamp': '2025-09-04 03:45:28.854016', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:45:28.963395', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.025268767029047012, 'timestamp': '2025-09-04 03:45:28.983767', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:29.089863', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.025647073984146118, 'timestamp': '2025-09-04 03:45:29.109647', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:45:29.193050', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.06310443580150604, 'timestamp': '2025-09-04 03:45:29.208797', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:45:29.289171', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.006233109161257744, 'timestamp': '2025-09-04 03:45:29.305528', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:29.403738', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.03743808716535568, 'timestamp': '2025-09-04 03:45:29.422112', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:45:29.532220', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.06183940917253494, 'timestamp': '2025-09-04 03:45:29.552634', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:29.658591', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.03081091120839119, 'timestamp': '2025-09-04 03:45:29.679007', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:38.054098', 'step': 80, 'epoch': 1} {'type': 'pplx', 'content': 318.24303809489555, 'timestamp': '2025-09-04 03:45:38.056443', 'step': 80, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 80', 'timestamp': '2025-09-04 03:45:38.556351', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:38.660607', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.0675966814160347, 'timestamp': '2025-09-04 03:45:38.682606', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:38.778116', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.07781155407428741, 'timestamp': '2025-09-04 03:45:38.795394', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:38.899758', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.06105361878871918, 'timestamp': '2025-09-04 03:45:38.918706', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:39.026173', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.049994468688964844, 'timestamp': '2025-09-04 03:45:39.046706', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:39.138861', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.1184161901473999, 'timestamp': '2025-09-04 03:45:39.157849', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:45:39.243936', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.07634230703115463, 'timestamp': '2025-09-04 03:45:39.259348', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:39.361762', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.03923124074935913, 'timestamp': '2025-09-04 03:45:39.380754', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:45:39.458872', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.0754866749048233, 'timestamp': '2025-09-04 03:45:39.473629', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:45:39.542590', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.049534834921360016, 'timestamp': '2025-09-04 03:45:39.556461', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:45:39.634576', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.09451133012771606, 'timestamp': '2025-09-04 03:45:39.648463', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:39.750807', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.1034846305847168, 'timestamp': '2025-09-04 03:45:39.770033', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:45:39.855459', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.060452286154031754, 'timestamp': '2025-09-04 03:45:39.871436', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:39.961786', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.05689983442425728, 'timestamp': '2025-09-04 03:45:39.980357', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:45:40.109208', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.009844101965427399, 'timestamp': '2025-09-04 03:45:40.132173', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:45:40.214872', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.08469025045633316, 'timestamp': '2025-09-04 03:45:40.229769', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:40.328739', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.10152354091405869, 'timestamp': '2025-09-04 03:45:40.347831', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:45:40.445260', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.0582207627594471, 'timestamp': '2025-09-04 03:45:40.465784', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:45:40.580598', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.05192510411143303, 'timestamp': '2025-09-04 03:45:40.600367', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:40.700989', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.09743145853281021, 'timestamp': '2025-09-04 03:45:40.719330', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:40.815781', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.07235053181648254, 'timestamp': '2025-09-04 03:45:40.833849', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:49.188649', 'step': 100, 'epoch': 1} {'type': 'pplx', 'content': 307.3854100186044, 'timestamp': '2025-09-04 03:45:49.190883', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:45:49.324428', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.04918394982814789, 'timestamp': '2025-09-04 03:45:49.353243', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:45:49.444040', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.12214594334363937, 'timestamp': '2025-09-04 03:45:49.460765', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:45:49.561386', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.044912759214639664, 'timestamp': '2025-09-04 03:45:49.579949', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:49.681534', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.14099526405334473, 'timestamp': '2025-09-04 03:45:49.701325', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:45:49.783498', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.0638093501329422, 'timestamp': '2025-09-04 03:45:49.800247', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:45:49.904552', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.013638557866215706, 'timestamp': '2025-09-04 03:45:49.923592', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:45:50.006927', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.07801222056150436, 'timestamp': '2025-09-04 03:45:50.022042', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:45:50.144157', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.06590745598077774, 'timestamp': '2025-09-04 03:45:50.166685', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:50.262360', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.11835378408432007, 'timestamp': '2025-09-04 03:45:50.282746', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:50.376627', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.12223128229379654, 'timestamp': '2025-09-04 03:45:50.393565', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:45:50.504278', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.1280670166015625, 'timestamp': '2025-09-04 03:45:50.524729', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:45:50.623814', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.06618773937225342, 'timestamp': '2025-09-04 03:45:50.642958', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:50.734068', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.04645576328039169, 'timestamp': '2025-09-04 03:45:50.752647', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:45:50.829181', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.13508065044879913, 'timestamp': '2025-09-04 03:45:50.842752', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:45:50.934855', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.0594618059694767, 'timestamp': '2025-09-04 03:45:50.951746', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:45:51.053664', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.11379887908697128, 'timestamp': '2025-09-04 03:45:51.073378', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:45:51.164518', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.039490919560194016, 'timestamp': '2025-09-04 03:45:51.183413', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:45:51.291693', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.04585837572813034, 'timestamp': '2025-09-04 03:45:51.311758', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:45:51.407070', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.07603704929351807, 'timestamp': '2025-09-04 03:45:51.424283', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:45:51.540887', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.016811968758702278, 'timestamp': '2025-09-04 03:45:51.563695', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:45:59.943776', 'step': 120, 'epoch': 1} {'type': 'pplx', 'content': 299.89610834083794, 'timestamp': '2025-09-04 03:45:59.945489', 'step': 120, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 120', 'timestamp': '2025-09-04 03:46:00.290401', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 03:46:00.476561', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.08362013101577759, 'timestamp': '2025-09-04 03:46:00.514696', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:46:00.596482', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.059237729758024216, 'timestamp': '2025-09-04 03:46:00.611683', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:46:00.727922', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.039004936814308167, 'timestamp': '2025-09-04 03:46:00.749930', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:46:00.824487', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.11705737560987473, 'timestamp': '2025-09-04 03:46:00.838610', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:00.914119', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.06630095839500427, 'timestamp': '2025-09-04 03:46:00.929469', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:46:01.019324', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.04516802728176117, 'timestamp': '2025-09-04 03:46:01.035861', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:01.144582', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.04330586642026901, 'timestamp': '2025-09-04 03:46:01.165051', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:46:01.259475', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.061153244227170944, 'timestamp': '2025-09-04 03:46:01.277402', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:01.374266', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.11837725341320038, 'timestamp': '2025-09-04 03:46:01.394579', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:46:01.476506', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.02547391690313816, 'timestamp': '2025-09-04 03:46:01.491380', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:01.598388', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.03671078011393547, 'timestamp': '2025-09-04 03:46:01.618545', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:46:01.711084', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.15452733635902405, 'timestamp': '2025-09-04 03:46:01.728810', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:01.804783', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.07617174834012985, 'timestamp': '2025-09-04 03:46:01.820045', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:01.920338', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.062410350888967514, 'timestamp': '2025-09-04 03:46:01.938924', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:46:02.054545', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.0330992229282856, 'timestamp': '2025-09-04 03:46:02.076694', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:46:02.186634', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.029934488236904144, 'timestamp': '2025-09-04 03:46:02.207952', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:02.306911', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.048758380115032196, 'timestamp': '2025-09-04 03:46:02.327997', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:02.424443', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.11269187927246094, 'timestamp': '2025-09-04 03:46:02.441749', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:02.543693', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.0775558203458786, 'timestamp': '2025-09-04 03:46:02.562600', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:02.658416', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.05858410522341728, 'timestamp': '2025-09-04 03:46:02.676400', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:46:11.045850', 'step': 140, 'epoch': 1} {'type': 'pplx', 'content': 295.5688961078636, 'timestamp': '2025-09-04 03:46:11.048049', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:11.145509', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.034822121262550354, 'timestamp': '2025-09-04 03:46:11.166258', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:11.275463', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.025513645261526108, 'timestamp': '2025-09-04 03:46:11.295707', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:11.405349', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.022908439859747887, 'timestamp': '2025-09-04 03:46:11.425606', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:11.531596', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.05994102731347084, 'timestamp': '2025-09-04 03:46:11.552142', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 03:46:11.680734', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.06833264976739883, 'timestamp': '2025-09-04 03:46:11.707754', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:11.809626', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.07911453396081924, 'timestamp': '2025-09-04 03:46:11.828239', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:11.934765', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.08064809441566467, 'timestamp': '2025-09-04 03:46:11.954473', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:12.062843', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.03310004621744156, 'timestamp': '2025-09-04 03:46:12.083696', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:46:12.166474', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.03188398852944374, 'timestamp': '2025-09-04 03:46:12.183268', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:12.293783', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.03804198279976845, 'timestamp': '2025-09-04 03:46:12.314141', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:12.418609', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.03391795977950096, 'timestamp': '2025-09-04 03:46:12.437588', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:12.515641', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.024908579885959625, 'timestamp': '2025-09-04 03:46:12.530339', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:46:12.621956', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.0160362645983696, 'timestamp': '2025-09-04 03:46:12.640839', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:12.741410', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.030184214934706688, 'timestamp': '2025-09-04 03:46:12.760113', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:12.839419', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.03809965401887894, 'timestamp': '2025-09-04 03:46:12.853334', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:12.953947', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.03310762345790863, 'timestamp': '2025-09-04 03:46:12.973351', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:13.079957', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.05042213574051857, 'timestamp': '2025-09-04 03:46:13.102326', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:46:13.223808', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.03846399486064911, 'timestamp': '2025-09-04 03:46:13.245799', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:13.354628', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.030467255041003227, 'timestamp': '2025-09-04 03:46:13.374922', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:13.482931', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.07905561476945877, 'timestamp': '2025-09-04 03:46:13.503834', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:46:21.876993', 'step': 160, 'epoch': 1} {'type': 'pplx', 'content': 299.0262854255139, 'timestamp': '2025-09-04 03:46:21.879135', 'step': 160, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 160', 'timestamp': '2025-09-04 03:46:22.373385', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:22.469238', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.10933341830968857, 'timestamp': '2025-09-04 03:46:22.489584', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:46:22.572640', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.029027054086327553, 'timestamp': '2025-09-04 03:46:22.587595', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:22.688492', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.012869374826550484, 'timestamp': '2025-09-04 03:46:22.707182', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:46:22.789587', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.03408244624733925, 'timestamp': '2025-09-04 03:46:22.805117', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:22.897367', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.024805987253785133, 'timestamp': '2025-09-04 03:46:22.916271', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:46:23.006515', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.02760004997253418, 'timestamp': '2025-09-04 03:46:23.023015', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:46:23.115490', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.06649627536535263, 'timestamp': '2025-09-04 03:46:23.132373', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:23.236020', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.08248773962259293, 'timestamp': '2025-09-04 03:46:23.255843', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:23.364354', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.030205007642507553, 'timestamp': '2025-09-04 03:46:23.386069', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:23.480780', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.07514587044715881, 'timestamp': '2025-09-04 03:46:23.498073', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:46:23.576363', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.07391712814569473, 'timestamp': '2025-09-04 03:46:23.589072', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:23.689317', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.04588964581489563, 'timestamp': '2025-09-04 03:46:23.708742', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:23.787355', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.01995799131691456, 'timestamp': '2025-09-04 03:46:23.802655', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:23.914759', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.04095141217112541, 'timestamp': '2025-09-04 03:46:23.934809', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:24.040047', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.07867230474948883, 'timestamp': '2025-09-04 03:46:24.060041', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:46:24.144862', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.08477169275283813, 'timestamp': '2025-09-04 03:46:24.161034', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:24.258703', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.048271287232637405, 'timestamp': '2025-09-04 03:46:24.279194', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:24.380449', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.056753601878881454, 'timestamp': '2025-09-04 03:46:24.399137', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:46:24.481124', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.0256480872631073, 'timestamp': '2025-09-04 03:46:24.496079', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:24.605160', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.00964992307126522, 'timestamp': '2025-09-04 03:46:24.626113', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:46:32.989309', 'step': 180, 'epoch': 1} {'type': 'pplx', 'content': 304.3604161862801, 'timestamp': '2025-09-04 03:46:32.990886', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:33.063611', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.08969972282648087, 'timestamp': '2025-09-04 03:46:33.078807', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:33.186793', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.029566926881670952, 'timestamp': '2025-09-04 03:46:33.207034', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:33.309505', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.016738848760724068, 'timestamp': '2025-09-04 03:46:33.328509', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:33.404480', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.030050721019506454, 'timestamp': '2025-09-04 03:46:33.419146', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:46:33.538599', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.032734520733356476, 'timestamp': '2025-09-04 03:46:33.562256', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:33.660578', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.03777821734547615, 'timestamp': '2025-09-04 03:46:33.678971', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:33.785505', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.07701154798269272, 'timestamp': '2025-09-04 03:46:33.805660', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:33.911868', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.040181923657655716, 'timestamp': '2025-09-04 03:46:33.932359', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:34.013114', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.10895628482103348, 'timestamp': '2025-09-04 03:46:34.028150', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:46:34.111556', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.03597872704267502, 'timestamp': '2025-09-04 03:46:34.126652', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:46:34.223903', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.020103169605135918, 'timestamp': '2025-09-04 03:46:34.241188', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:34.350326', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.04040881618857384, 'timestamp': '2025-09-04 03:46:34.371538', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:46:34.478562', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.05105682834982872, 'timestamp': '2025-09-04 03:46:34.501219', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:34.606829', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.013259065337479115, 'timestamp': '2025-09-04 03:46:34.626713', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:34.721624', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.05866575241088867, 'timestamp': '2025-09-04 03:46:34.738863', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:34.815582', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.0314386822283268, 'timestamp': '2025-09-04 03:46:34.830274', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:34.928107', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.08559767156839371, 'timestamp': '2025-09-04 03:46:34.948472', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:35.041821', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.051514316350221634, 'timestamp': '2025-09-04 03:46:35.059202', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:46:35.153126', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.021353455260396004, 'timestamp': '2025-09-04 03:46:35.170412', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:46:35.280731', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.006292775738984346, 'timestamp': '2025-09-04 03:46:35.301953', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:46:43.677848', 'step': 200, 'epoch': 1} {'type': 'pplx', 'content': 308.7893902698641, 'timestamp': '2025-09-04 03:46:43.680096', 'step': 200, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 200', 'timestamp': '2025-09-04 03:46:44.033775', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:46:44.114791', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.13709776103496552, 'timestamp': '2025-09-04 03:46:44.131611', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:44.234364', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.06784407794475555, 'timestamp': '2025-09-04 03:46:44.253436', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:46:44.346048', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.023510560393333435, 'timestamp': '2025-09-04 03:46:44.363111', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:44.440942', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.1330084502696991, 'timestamp': '2025-09-04 03:46:44.455646', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:44.555190', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.02427174523472786, 'timestamp': '2025-09-04 03:46:44.576158', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:44.654034', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.08518130332231522, 'timestamp': '2025-09-04 03:46:44.667993', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:46:44.745924', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.028402844443917274, 'timestamp': '2025-09-04 03:46:44.759722', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:44.862696', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.022264117375016212, 'timestamp': '2025-09-04 03:46:44.882252', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:44.989666', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.014760280027985573, 'timestamp': '2025-09-04 03:46:45.011616', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:45.120759', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.02477847971022129, 'timestamp': '2025-09-04 03:46:45.140323', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:45.245385', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.09371557086706161, 'timestamp': '2025-09-04 03:46:45.264286', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:45.368282', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.040149930864572525, 'timestamp': '2025-09-04 03:46:45.387896', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:45.487708', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.05073068290948868, 'timestamp': '2025-09-04 03:46:45.508705', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 03:46:45.724051', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.03666497766971588, 'timestamp': '2025-09-04 03:46:45.762808', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:45.841828', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.025966059416532516, 'timestamp': '2025-09-04 03:46:45.855469', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:45.957760', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.04828720539808273, 'timestamp': '2025-09-04 03:46:45.976941', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:46.074755', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.008038034662604332, 'timestamp': '2025-09-04 03:46:46.094767', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:46.198305', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.02476082369685173, 'timestamp': '2025-09-04 03:46:46.217094', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:46.317114', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.03774585947394371, 'timestamp': '2025-09-04 03:46:46.335244', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:46:46.435893', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.04572395235300064, 'timestamp': '2025-09-04 03:46:46.454869', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:46:54.891309', 'step': 220, 'epoch': 1} {'type': 'pplx', 'content': 311.0440274571144, 'timestamp': '2025-09-04 03:46:54.893262', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:54.966356', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.020546726882457733, 'timestamp': '2025-09-04 03:46:54.981421', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:55.059442', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.047001611441373825, 'timestamp': '2025-09-04 03:46:55.073274', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:46:55.182713', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.04990854486823082, 'timestamp': '2025-09-04 03:46:55.203243', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:46:55.297066', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.03975335881114006, 'timestamp': '2025-09-04 03:46:55.314614', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:55.412711', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.037995487451553345, 'timestamp': '2025-09-04 03:46:55.433055', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:55.535515', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.06138302758336067, 'timestamp': '2025-09-04 03:46:55.554130', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 03:46:55.774170', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.04723235219717026, 'timestamp': '2025-09-04 03:46:55.816481', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:46:55.920763', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.013455665670335293, 'timestamp': '2025-09-04 03:46:55.940359', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:46:56.016328', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.05484043434262276, 'timestamp': '2025-09-04 03:46:56.031474', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1200], 'flops': 24000145761984.0}, 'timestamp': '2025-09-04 03:46:56.210233', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.011813382618129253, 'timestamp': '2025-09-04 03:46:56.242851', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:56.338311', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.04934141784906387, 'timestamp': '2025-09-04 03:46:56.355328', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:46:56.462670', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.01455477625131607, 'timestamp': '2025-09-04 03:46:56.483049', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:46:56.589711', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.035196904093027115, 'timestamp': '2025-09-04 03:46:56.611601', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:56.713067', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.1223868653178215, 'timestamp': '2025-09-04 03:46:56.731468', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:56.836088', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.023488562554121017, 'timestamp': '2025-09-04 03:46:56.855251', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:46:57.059604', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.041184525936841965, 'timestamp': '2025-09-04 03:46:57.099406', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:46:57.192703', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.04467172920703888, 'timestamp': '2025-09-04 03:46:57.211674', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:46:57.296253', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.058227408677339554, 'timestamp': '2025-09-04 03:46:57.311130', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:46:57.414555', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.024533184245228767, 'timestamp': '2025-09-04 03:46:57.433467', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:46:57.533042', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.03975701332092285, 'timestamp': '2025-09-04 03:46:57.552325', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:47:05.981740', 'step': 240, 'epoch': 1} {'type': 'pplx', 'content': 312.9286034141921, 'timestamp': '2025-09-04 03:47:05.983838', 'step': 240, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 240', 'timestamp': '2025-09-04 03:47:06.491752', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:06.593414', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.022641537711024284, 'timestamp': '2025-09-04 03:47:06.614328', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:47:06.718158', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.026233583688735962, 'timestamp': '2025-09-04 03:47:06.737180', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:47:06.859887', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.030290089547634125, 'timestamp': '2025-09-04 03:47:06.882829', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:06.989864', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.009676921181380749, 'timestamp': '2025-09-04 03:47:07.010467', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:47:07.118651', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.02796107716858387, 'timestamp': '2025-09-04 03:47:07.141141', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:07.251743', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.06316182762384415, 'timestamp': '2025-09-04 03:47:07.271865', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:47:07.349164', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.053157925605773926, 'timestamp': '2025-09-04 03:47:07.362748', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:47:07.501150', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.1000499501824379, 'timestamp': '2025-09-04 03:47:07.527516', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:47:07.603960', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.07537966966629028, 'timestamp': '2025-09-04 03:47:07.618912', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:07.723085', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.04253721982240677, 'timestamp': '2025-09-04 03:47:07.742175', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:07.843434', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.026042405515909195, 'timestamp': '2025-09-04 03:47:07.862102', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:07.971729', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.0640282928943634, 'timestamp': '2025-09-04 03:47:07.992076', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:08.094410', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.013043270446360111, 'timestamp': '2025-09-04 03:47:08.115388', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:08.219527', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.02681521698832512, 'timestamp': '2025-09-04 03:47:08.238620', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 03:47:08.416005', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.02740609645843506, 'timestamp': '2025-09-04 03:47:08.448567', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:47:08.536920', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.06523073464632034, 'timestamp': '2025-09-04 03:47:08.553113', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:08.645862', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.019541790708899498, 'timestamp': '2025-09-04 03:47:08.664680', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:08.758789', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.02935168892145157, 'timestamp': '2025-09-04 03:47:08.775713', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:08.870347', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.04195958375930786, 'timestamp': '2025-09-04 03:47:08.887314', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:08.989716', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.013827347196638584, 'timestamp': '2025-09-04 03:47:09.008620', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:47:17.480249', 'step': 260, 'epoch': 1} {'type': 'pplx', 'content': 313.3745874189351, 'timestamp': '2025-09-04 03:47:17.482337', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:17.585602', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.056827034801244736, 'timestamp': '2025-09-04 03:47:17.607847', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:17.707055', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.04780471324920654, 'timestamp': '2025-09-04 03:47:17.725413', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:17.824753', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.024920674040913582, 'timestamp': '2025-09-04 03:47:17.843217', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:47:17.953478', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.016696227714419365, 'timestamp': '2025-09-04 03:47:17.974500', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:18.065874', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.0172750111669302, 'timestamp': '2025-09-04 03:47:18.084464', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:47:18.194133', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.0052177682518959045, 'timestamp': '2025-09-04 03:47:18.214495', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:47:18.298475', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.03147877752780914, 'timestamp': '2025-09-04 03:47:18.313335', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:47:18.416050', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.03740302100777626, 'timestamp': '2025-09-04 03:47:18.435957', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:47:18.634340', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.08886415511369705, 'timestamp': '2025-09-04 03:47:18.677352', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:47:18.795835', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.024287065491080284, 'timestamp': '2025-09-04 03:47:18.818027', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:18.913367', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.05668797716498375, 'timestamp': '2025-09-04 03:47:18.930814', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:19.039604', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.05908737704157829, 'timestamp': '2025-09-04 03:47:19.060419', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:19.164051', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.010910317301750183, 'timestamp': '2025-09-04 03:47:19.185981', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:47:19.325988', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.019638676196336746, 'timestamp': '2025-09-04 03:47:19.351679', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:47:19.436571', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.07036858052015305, 'timestamp': '2025-09-04 03:47:19.451806', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:47:19.546137', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.026476135477423668, 'timestamp': '2025-09-04 03:47:19.563442', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:19.661217', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.0284750796854496, 'timestamp': '2025-09-04 03:47:19.681378', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:47:19.765056', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.10833613574504852, 'timestamp': '2025-09-04 03:47:19.780198', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:19.872507', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.030009621754288673, 'timestamp': '2025-09-04 03:47:19.889386', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:47:19.962887', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.05326032638549805, 'timestamp': '2025-09-04 03:47:19.976593', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:47:28.367516', 'step': 280, 'epoch': 1} {'type': 'pplx', 'content': 312.74428797361605, 'timestamp': '2025-09-04 03:47:28.370395', 'step': 280, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 280', 'timestamp': '2025-09-04 03:47:28.839161', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:47:28.912750', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.03643646836280823, 'timestamp': '2025-09-04 03:47:28.927377', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:47:29.006960', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.04280658811330795, 'timestamp': '2025-09-04 03:47:29.020511', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:29.117476', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.01976653002202511, 'timestamp': '2025-09-04 03:47:29.134794', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:29.230289', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.0358404815196991, 'timestamp': '2025-09-04 03:47:29.247810', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:29.346237', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.029284300282597542, 'timestamp': '2025-09-04 03:47:29.366289', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:47:29.485868', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.1099967360496521, 'timestamp': '2025-09-04 03:47:29.507498', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:29.604126', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.023914791643619537, 'timestamp': '2025-09-04 03:47:29.621110', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:47:29.760597', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.05426377058029175, 'timestamp': '2025-09-04 03:47:29.787210', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:29.879326', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.050082337111234665, 'timestamp': '2025-09-04 03:47:29.897720', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:29.994274', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.05403923988342285, 'timestamp': '2025-09-04 03:47:30.011501', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:30.111748', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.047084759920835495, 'timestamp': '2025-09-04 03:47:30.130634', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:47:30.210351', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.07940501719713211, 'timestamp': '2025-09-04 03:47:30.224933', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:30.323714', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.09987480938434601, 'timestamp': '2025-09-04 03:47:30.344265', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 03:47:30.419688', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.0861741453409195, 'timestamp': '2025-09-04 03:47:30.432119', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:30.541849', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.03038984164595604, 'timestamp': '2025-09-04 03:47:30.561788', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:30.663178', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.017293287441134453, 'timestamp': '2025-09-04 03:47:30.682096', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:47:30.766176', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.0584249384701252, 'timestamp': '2025-09-04 03:47:30.782879', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:30.878075', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.053005725145339966, 'timestamp': '2025-09-04 03:47:30.895153', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:30.996825', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.015453525818884373, 'timestamp': '2025-09-04 03:47:31.015301', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:47:31.125136', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.013833635486662388, 'timestamp': '2025-09-04 03:47:31.145828', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:47:39.537720', 'step': 300, 'epoch': 1} {'type': 'pplx', 'content': 311.2226619091774, 'timestamp': '2025-09-04 03:47:39.539677', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:39.635130', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.005779569037258625, 'timestamp': '2025-09-04 03:47:39.655327', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:47:39.731923', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.11063537001609802, 'timestamp': '2025-09-04 03:47:39.745638', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:39.839960', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.02686300128698349, 'timestamp': '2025-09-04 03:47:39.857135', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:39.963489', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.06041613593697548, 'timestamp': '2025-09-04 03:47:39.983975', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:40.083742', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.01706847734749317, 'timestamp': '2025-09-04 03:47:40.104803', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:40.210413', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.011196613311767578, 'timestamp': '2025-09-04 03:47:40.230267', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:47:40.320630', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.07578642666339874, 'timestamp': '2025-09-04 03:47:40.337139', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:40.430408', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.04545224457979202, 'timestamp': '2025-09-04 03:47:40.448605', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:40.549272', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.041280921548604965, 'timestamp': '2025-09-04 03:47:40.570265', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:47:40.668868', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.05615556985139847, 'timestamp': '2025-09-04 03:47:40.682155', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:47:40.785194', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.07136084884405136, 'timestamp': '2025-09-04 03:47:40.804167', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:40.914245', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.019209880381822586, 'timestamp': '2025-09-04 03:47:40.933587', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:47:41.019153', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.10823129117488861, 'timestamp': '2025-09-04 03:47:41.036119', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:41.135194', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.03277946263551712, 'timestamp': '2025-09-04 03:47:41.153529', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:47:41.237996', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.06067274883389473, 'timestamp': '2025-09-04 03:47:41.253446', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:47:41.381637', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.01747446320950985, 'timestamp': '2025-09-04 03:47:41.405328', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:41.496751', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.024807121604681015, 'timestamp': '2025-09-04 03:47:41.515635', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:41.620768', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.008701799437403679, 'timestamp': '2025-09-04 03:47:41.640483', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:47:41.746608', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.03150641545653343, 'timestamp': '2025-09-04 03:47:41.766485', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:47:41.902464', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.04005073383450508, 'timestamp': '2025-09-04 03:47:41.929101', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:47:50.311558', 'step': 320, 'epoch': 1} {'type': 'pplx', 'content': 309.1398029399646, 'timestamp': '2025-09-04 03:47:50.313320', 'step': 320, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 320', 'timestamp': '2025-09-04 03:47:50.768107', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:47:50.886087', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.024150997400283813, 'timestamp': '2025-09-04 03:47:50.911098', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:51.005388', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.013538923114538193, 'timestamp': '2025-09-04 03:47:51.022833', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:47:51.158175', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.03433951735496521, 'timestamp': '2025-09-04 03:47:51.183772', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:47:51.282848', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.04349237680435181, 'timestamp': '2025-09-04 03:47:51.301876', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:47:51.408869', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.03356247767806053, 'timestamp': '2025-09-04 03:47:51.431173', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:47:51.534373', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.04008670523762703, 'timestamp': '2025-09-04 03:47:51.553175', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:47:51.629153', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.007040772121399641, 'timestamp': '2025-09-04 03:47:51.642842', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:51.735567', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.03876568377017975, 'timestamp': '2025-09-04 03:47:51.753448', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:47:51.844898', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.03228950873017311, 'timestamp': '2025-09-04 03:47:51.863955', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:47:51.958554', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.09584397077560425, 'timestamp': '2025-09-04 03:47:51.975592', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:52.076487', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.05004340410232544, 'timestamp': '2025-09-04 03:47:52.095272', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:47:52.173583', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.03638507053256035, 'timestamp': '2025-09-04 03:47:52.188303', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:47:52.280377', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.05368280038237572, 'timestamp': '2025-09-04 03:47:52.299280', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:47:52.376428', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.05048099532723427, 'timestamp': '2025-09-04 03:47:52.390448', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:47:52.494290', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.0237380713224411, 'timestamp': '2025-09-04 03:47:52.513321', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:47:52.616448', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.014010374434292316, 'timestamp': '2025-09-04 03:47:52.636162', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:47:52.724698', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.052086420357227325, 'timestamp': '2025-09-04 03:47:52.743063', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:47:52.854009', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.04896777868270874, 'timestamp': '2025-09-04 03:47:52.874728', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:47:52.974955', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.06648825109004974, 'timestamp': '2025-09-04 03:47:52.993821', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:47:53.081148', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.09628743678331375, 'timestamp': '2025-09-04 03:47:53.097474', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:02.050472', 'step': 340, 'epoch': 1} {'type': 'pplx', 'content': 307.83904285220194, 'timestamp': '2025-09-04 03:48:02.053159', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:48:02.127903', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.027161644771695137, 'timestamp': '2025-09-04 03:48:02.142965', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:48:02.218754', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.0412270650267601, 'timestamp': '2025-09-04 03:48:02.232083', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:48:02.339531', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.058129601180553436, 'timestamp': '2025-09-04 03:48:02.359457', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:02.470920', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.04298894852399826, 'timestamp': '2025-09-04 03:48:02.492120', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:02.591173', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.04139872267842293, 'timestamp': '2025-09-04 03:48:02.611744', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:48:02.748595', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.036456745117902756, 'timestamp': '2025-09-04 03:48:02.774427', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:02.879028', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.09559651464223862, 'timestamp': '2025-09-04 03:48:02.898191', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:48:02.971130', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.036566995084285736, 'timestamp': '2025-09-04 03:48:02.984752', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:03.082772', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.04065980389714241, 'timestamp': '2025-09-04 03:48:03.103069', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:03.208487', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.05061135068535805, 'timestamp': '2025-09-04 03:48:03.227437', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:48:03.303490', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.016793936491012573, 'timestamp': '2025-09-04 03:48:03.317128', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:48:03.401892', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.02601003088057041, 'timestamp': '2025-09-04 03:48:03.417643', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:03.519469', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.02657604031264782, 'timestamp': '2025-09-04 03:48:03.540081', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:48:03.627443', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.02620949223637581, 'timestamp': '2025-09-04 03:48:03.643009', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:48:03.729847', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.05669962242245674, 'timestamp': '2025-09-04 03:48:03.745335', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:03.847153', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.028145290911197662, 'timestamp': '2025-09-04 03:48:03.866641', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:48:03.986575', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.0207599475979805, 'timestamp': '2025-09-04 03:48:04.011984', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:48:04.091711', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.030706727877259254, 'timestamp': '2025-09-04 03:48:04.105863', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:48:04.212114', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.010737722739577293, 'timestamp': '2025-09-04 03:48:04.232012', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 03:48:04.362847', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.029181912541389465, 'timestamp': '2025-09-04 03:48:04.387540', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:12.919636', 'step': 360, 'epoch': 1} {'type': 'pplx', 'content': 307.5586884862369, 'timestamp': '2025-09-04 03:48:12.935018', 'step': 360, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 360', 'timestamp': '2025-09-04 03:48:13.409472', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:48:13.515183', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.03834038972854614, 'timestamp': '2025-09-04 03:48:13.529742', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:48:13.687701', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.025468185544013977, 'timestamp': '2025-09-04 03:48:13.706710', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:13.836524', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.0641966164112091, 'timestamp': '2025-09-04 03:48:13.855536', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:48:13.996064', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.03254721313714981, 'timestamp': '2025-09-04 03:48:14.017115', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:48:14.164508', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.14557507634162903, 'timestamp': '2025-09-04 03:48:14.186890', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:14.293921', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.03721340000629425, 'timestamp': '2025-09-04 03:48:14.311196', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:48:14.437001', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.011114726774394512, 'timestamp': '2025-09-04 03:48:14.457608', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:48:14.600803', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.0390244759619236, 'timestamp': '2025-09-04 03:48:14.618979', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:48:14.746624', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.09442558139562607, 'timestamp': '2025-09-04 03:48:14.767824', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:14.883101', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.07992935180664062, 'timestamp': '2025-09-04 03:48:14.901810', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:15.024280', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.05484499782323837, 'timestamp': '2025-09-04 03:48:15.042914', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:48:15.147914', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.06619900465011597, 'timestamp': '2025-09-04 03:48:15.164438', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:48:15.250873', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.03658528998494148, 'timestamp': '2025-09-04 03:48:15.303114', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:15.433580', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.022451380267739296, 'timestamp': '2025-09-04 03:48:15.452866', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 03:48:15.615676', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.05221700668334961, 'timestamp': '2025-09-04 03:48:15.639715', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1040], 'flops': 20800126336064.0}, 'timestamp': '2025-09-04 03:48:15.819023', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.09046898037195206, 'timestamp': '2025-09-04 03:48:15.849378', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:48:15.998775', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.07325262576341629, 'timestamp': '2025-09-04 03:48:16.023460', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:48:16.119471', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.042737096548080444, 'timestamp': '2025-09-04 03:48:16.133559', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:16.283766', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.02026101015508175, 'timestamp': '2025-09-04 03:48:16.304404', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:16.427741', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.011748573742806911, 'timestamp': '2025-09-04 03:48:16.445240', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:25.287729', 'step': 380, 'epoch': 1} {'type': 'pplx', 'content': 311.8221380685682, 'timestamp': '2025-09-04 03:48:25.295794', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:25.392305', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.013346564956009388, 'timestamp': '2025-09-04 03:48:25.410543', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:25.535167', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.013797925785183907, 'timestamp': '2025-09-04 03:48:25.555859', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:25.681692', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.015700260177254677, 'timestamp': '2025-09-04 03:48:25.701141', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:25.835149', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.024386491626501083, 'timestamp': '2025-09-04 03:48:25.859596', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:25.987712', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.016429277136921883, 'timestamp': '2025-09-04 03:48:26.006472', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:48:26.122316', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.011799799278378487, 'timestamp': '2025-09-04 03:48:26.141876', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:26.248691', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.06375425308942795, 'timestamp': '2025-09-04 03:48:26.265309', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:48:26.346698', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.035253461450338364, 'timestamp': '2025-09-04 03:48:26.361086', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:26.473191', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.05822211131453514, 'timestamp': '2025-09-04 03:48:26.494275', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:48:26.575019', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.035860493779182434, 'timestamp': '2025-09-04 03:48:26.588671', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:48:26.674728', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.025279760360717773, 'timestamp': '2025-09-04 03:48:26.688319', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:48:26.776869', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.022274592891335487, 'timestamp': '2025-09-04 03:48:26.792909', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:26.892516', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.026588624343276024, 'timestamp': '2025-09-04 03:48:26.912495', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:27.007916', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.0037299662362784147, 'timestamp': '2025-09-04 03:48:27.024992', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:27.130031', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.016043562442064285, 'timestamp': '2025-09-04 03:48:27.149089', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:48:27.261367', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.022281493991613388, 'timestamp': '2025-09-04 03:48:27.282593', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:27.381479', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.019878646358847618, 'timestamp': '2025-09-04 03:48:27.401844', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:27.493393', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.06282302737236023, 'timestamp': '2025-09-04 03:48:27.510053', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:27.627066', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.016571981832385063, 'timestamp': '2025-09-04 03:48:27.646198', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:27.748235', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.05791422352194786, 'timestamp': '2025-09-04 03:48:27.767514', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:36.321805', 'step': 400, 'epoch': 1} {'type': 'pplx', 'content': 316.7635327191992, 'timestamp': '2025-09-04 03:48:36.324054', 'step': 400, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 400', 'timestamp': '2025-09-04 03:48:36.675112', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:48:36.774708', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.022503888234496117, 'timestamp': '2025-09-04 03:48:36.795589', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:36.889466', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.021656574681401253, 'timestamp': '2025-09-04 03:48:36.906649', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:48:36.992132', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.036612629890441895, 'timestamp': '2025-09-04 03:48:37.007536', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:48:37.081308', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.015442884527146816, 'timestamp': '2025-09-04 03:48:37.094723', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:37.185279', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.02640901878476143, 'timestamp': '2025-09-04 03:48:37.203874', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:48:37.290602', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.05808902904391289, 'timestamp': '2025-09-04 03:48:37.305715', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:37.401155', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.02911917120218277, 'timestamp': '2025-09-04 03:48:37.418222', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:48:37.494702', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.012547432444989681, 'timestamp': '2025-09-04 03:48:37.508723', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:37.615037', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.029442116618156433, 'timestamp': '2025-09-04 03:48:37.637295', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:48:37.743694', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.017742721363902092, 'timestamp': '2025-09-04 03:48:37.763621', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:37.854723', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.13883210718631744, 'timestamp': '2025-09-04 03:48:37.871242', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:37.964918', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.06384449452161789, 'timestamp': '2025-09-04 03:48:37.982828', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:48:38.096961', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.021444780752062798, 'timestamp': '2025-09-04 03:48:38.120984', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:38.213825', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.023383496329188347, 'timestamp': '2025-09-04 03:48:38.230925', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:38.334573', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.011450543999671936, 'timestamp': '2025-09-04 03:48:38.353477', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:48:38.430473', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.03750946745276451, 'timestamp': '2025-09-04 03:48:38.445210', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:38.543176', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.011969326063990593, 'timestamp': '2025-09-04 03:48:38.563556', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:48:38.639745', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.07576386630535126, 'timestamp': '2025-09-04 03:48:38.653215', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:48:38.787749', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.009221615269780159, 'timestamp': '2025-09-04 03:48:38.813311', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:38.913752', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.020009953528642654, 'timestamp': '2025-09-04 03:48:38.932761', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:47.342665', 'step': 420, 'epoch': 1} {'type': 'pplx', 'content': 319.4097698572594, 'timestamp': '2025-09-04 03:48:47.344661', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:47.449253', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.053698133677244186, 'timestamp': '2025-09-04 03:48:47.471467', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:47.581639', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.009336840361356735, 'timestamp': '2025-09-04 03:48:47.602046', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:48:47.722268', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.017453299835324287, 'timestamp': '2025-09-04 03:48:47.744067', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:48:47.827080', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.03400561586022377, 'timestamp': '2025-09-04 03:48:47.842904', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:48:47.936119', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.012152721174061298, 'timestamp': '2025-09-04 03:48:47.955176', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:48.055267', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.052000127732753754, 'timestamp': '2025-09-04 03:48:48.073372', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:48.167311', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.01812596619129181, 'timestamp': '2025-09-04 03:48:48.184092', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:48:48.295139', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.038504090160131454, 'timestamp': '2025-09-04 03:48:48.316458', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:48.416934', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.07306548207998276, 'timestamp': '2025-09-04 03:48:48.438023', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:48:48.533780', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.08940979093313217, 'timestamp': '2025-09-04 03:48:48.551198', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:48.647281', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.01325355563312769, 'timestamp': '2025-09-04 03:48:48.664338', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:48:48.759489', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.046491898596286774, 'timestamp': '2025-09-04 03:48:48.777517', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:48:48.883612', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.12319537252187729, 'timestamp': '2025-09-04 03:48:48.905497', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:49.016164', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.0106568094342947, 'timestamp': '2025-09-04 03:48:49.036142', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:48:49.136703', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.009872270748019218, 'timestamp': '2025-09-04 03:48:49.155241', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:49.256981', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.016583112999796867, 'timestamp': '2025-09-04 03:48:49.275694', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:48:49.360494', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.008935445919632912, 'timestamp': '2025-09-04 03:48:49.376832', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:49.468735', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.03650280088186264, 'timestamp': '2025-09-04 03:48:49.485004', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:49.581069', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.017082802951335907, 'timestamp': '2025-09-04 03:48:49.598116', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:49.702688', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.007683966308832169, 'timestamp': '2025-09-04 03:48:49.722439', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:48:58.199774', 'step': 440, 'epoch': 1} {'type': 'pplx', 'content': 321.40916814073927, 'timestamp': '2025-09-04 03:48:58.202230', 'step': 440, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 440', 'timestamp': '2025-09-04 03:48:58.565856', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:48:58.664276', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.023604106158018112, 'timestamp': '2025-09-04 03:48:58.684199', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 03:48:58.904564', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.012493900023400784, 'timestamp': '2025-09-04 03:48:58.946767', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:48:59.054947', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.027849415317177773, 'timestamp': '2025-09-04 03:48:59.074061', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:48:59.168688', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.027136214077472687, 'timestamp': '2025-09-04 03:48:59.186008', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:48:59.260823', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.06847383081912994, 'timestamp': '2025-09-04 03:48:59.275370', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:48:59.387276', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.006238785106688738, 'timestamp': '2025-09-04 03:48:59.407172', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:48:59.495549', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.060996416956186295, 'timestamp': '2025-09-04 03:48:59.510441', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:48:59.602462', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.04064783453941345, 'timestamp': '2025-09-04 03:48:59.619374', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:48:59.724508', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.08736442029476166, 'timestamp': '2025-09-04 03:48:59.745799', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:48:59.850860', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.0009415106615051627, 'timestamp': '2025-09-04 03:48:59.869303', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:48:59.964786', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.043740540742874146, 'timestamp': '2025-09-04 03:48:59.981465', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:00.076418', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.015802182257175446, 'timestamp': '2025-09-04 03:49:00.093653', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:00.195759', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.07930180430412292, 'timestamp': '2025-09-04 03:49:00.216444', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:49:00.312721', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.01712995208799839, 'timestamp': '2025-09-04 03:49:00.329671', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:49:00.439710', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.06034353747963905, 'timestamp': '2025-09-04 03:49:00.460043', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:00.552284', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.008295533247292042, 'timestamp': '2025-09-04 03:49:00.569255', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:49:00.653046', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.019054660573601723, 'timestamp': '2025-09-04 03:49:00.669066', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:49:00.767740', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.01991303637623787, 'timestamp': '2025-09-04 03:49:00.784981', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:00.886589', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.033343605697155, 'timestamp': '2025-09-04 03:49:00.904779', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:01.005430', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.07760113477706909, 'timestamp': '2025-09-04 03:49:01.024576', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:49:09.462043', 'step': 460, 'epoch': 1} {'type': 'pplx', 'content': 324.06946329758586, 'timestamp': '2025-09-04 03:49:09.463792', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:49:09.541744', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.04705421254038811, 'timestamp': '2025-09-04 03:49:09.558323', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:09.663299', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.004760258831083775, 'timestamp': '2025-09-04 03:49:09.683378', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:09.775625', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.02817857638001442, 'timestamp': '2025-09-04 03:49:09.792725', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:09.895742', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.04996614158153534, 'timestamp': '2025-09-04 03:49:09.915742', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:10.013902', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.03526470810174942, 'timestamp': '2025-09-04 03:49:10.034308', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:49:10.118179', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.061329569667577744, 'timestamp': '2025-09-04 03:49:10.133268', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:49:10.335498', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.013789204880595207, 'timestamp': '2025-09-04 03:49:10.374743', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:49:10.485354', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.049680087715387344, 'timestamp': '2025-09-04 03:49:10.506771', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:49:10.581897', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.05645358934998512, 'timestamp': '2025-09-04 03:49:10.597034', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:49:10.706775', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.01656719297170639, 'timestamp': '2025-09-04 03:49:10.727119', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:49:10.805413', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.01675380952656269, 'timestamp': '2025-09-04 03:49:10.819234', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:49:10.891804', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.005480926018208265, 'timestamp': '2025-09-04 03:49:10.905273', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:49:11.023420', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.037396758794784546, 'timestamp': '2025-09-04 03:49:11.048758', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:49:11.126774', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.029778504744172096, 'timestamp': '2025-09-04 03:49:11.140580', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 03:49:11.277106', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.0628209188580513, 'timestamp': '2025-09-04 03:49:11.301517', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:11.400955', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.02091328613460064, 'timestamp': '2025-09-04 03:49:11.420085', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:11.510953', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.060857050120830536, 'timestamp': '2025-09-04 03:49:11.529984', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:11.623012', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.03812684863805771, 'timestamp': '2025-09-04 03:49:11.639932', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:49:11.725628', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.09244571626186371, 'timestamp': '2025-09-04 03:49:11.741008', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:11.833497', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.05786127969622612, 'timestamp': '2025-09-04 03:49:11.851190', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:49:20.202978', 'step': 480, 'epoch': 1} {'type': 'pplx', 'content': 328.5239349757793, 'timestamp': '2025-09-04 03:49:20.204938', 'step': 480, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 480', 'timestamp': '2025-09-04 03:49:20.553378', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:49:20.657173', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.047159433364868164, 'timestamp': '2025-09-04 03:49:20.679232', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:20.773246', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.07475479692220688, 'timestamp': '2025-09-04 03:49:20.790699', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:20.893431', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.012311739847064018, 'timestamp': '2025-09-04 03:49:20.912687', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:49:21.022565', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.011123532429337502, 'timestamp': '2025-09-04 03:49:21.043679', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:49:21.148538', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.06513547897338867, 'timestamp': '2025-09-04 03:49:21.170605', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:21.276579', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.019697437062859535, 'timestamp': '2025-09-04 03:49:21.296331', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:21.403122', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.06598882377147675, 'timestamp': '2025-09-04 03:49:21.422840', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:21.529193', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.0512818843126297, 'timestamp': '2025-09-04 03:49:21.549823', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:49:21.624011', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.021413128823041916, 'timestamp': '2025-09-04 03:49:21.638832', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:49:21.723109', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.018553882837295532, 'timestamp': '2025-09-04 03:49:21.738095', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:49:21.839249', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.11178267747163773, 'timestamp': '2025-09-04 03:49:21.856511', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:49:21.933185', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.07693499326705933, 'timestamp': '2025-09-04 03:49:21.947886', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:22.035794', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.03976715728640556, 'timestamp': '2025-09-04 03:49:22.053973', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:49:22.137152', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.025103233754634857, 'timestamp': '2025-09-04 03:49:22.152121', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:49:22.259839', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.011973893269896507, 'timestamp': '2025-09-04 03:49:22.279867', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:22.378194', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.08681105077266693, 'timestamp': '2025-09-04 03:49:22.397335', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:49:22.471677', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.0133007001131773, 'timestamp': '2025-09-04 03:49:22.486802', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:49:22.562649', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.05417775362730026, 'timestamp': '2025-09-04 03:49:22.576169', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:49:22.652206', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.04143834114074707, 'timestamp': '2025-09-04 03:49:22.665755', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:49:22.749786', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.021494613960385323, 'timestamp': '2025-09-04 03:49:22.765498', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:49:31.153962', 'step': 500, 'epoch': 1} {'type': 'pplx', 'content': 332.01159701525927, 'timestamp': '2025-09-04 03:49:31.156242', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:49:31.271328', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.057877492159605026, 'timestamp': '2025-09-04 03:49:31.295176', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:31.398872', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.023877454921603203, 'timestamp': '2025-09-04 03:49:31.418005', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:31.518096', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.017220618203282356, 'timestamp': '2025-09-04 03:49:31.536414', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:49:31.647385', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.03338051587343216, 'timestamp': '2025-09-04 03:49:31.668588', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:31.760724', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.008282708935439587, 'timestamp': '2025-09-04 03:49:31.779547', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:49:31.854732', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.027649713680148125, 'timestamp': '2025-09-04 03:49:31.868026', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:49:31.978236', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.07461496442556381, 'timestamp': '2025-09-04 03:49:31.998627', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:32.101885', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.034678079187870026, 'timestamp': '2025-09-04 03:49:32.121824', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:32.225900', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.017775679007172585, 'timestamp': '2025-09-04 03:49:32.247614', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:49:32.324639', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.021030427888035774, 'timestamp': '2025-09-04 03:49:32.337950', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:32.431041', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.0012499457225203514, 'timestamp': '2025-09-04 03:49:32.448340', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:32.550823', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.05383225530385971, 'timestamp': '2025-09-04 03:49:32.570216', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:32.673612', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.09943293035030365, 'timestamp': '2025-09-04 03:49:32.695354', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:49:32.804728', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.067501500248909, 'timestamp': '2025-09-04 03:49:32.825120', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:49:32.897035', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.056289635598659515, 'timestamp': '2025-09-04 03:49:32.909856', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:33.004917', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.020486541092395782, 'timestamp': '2025-09-04 03:49:33.022917', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:33.113978', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.09998507797718048, 'timestamp': '2025-09-04 03:49:33.132916', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:49:33.234964', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.018785972148180008, 'timestamp': '2025-09-04 03:49:33.254022', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:33.346763', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.05426869913935661, 'timestamp': '2025-09-04 03:49:33.363701', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:49:33.458619', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.0028427981305867434, 'timestamp': '2025-09-04 03:49:33.476697', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:49:41.838088', 'step': 520, 'epoch': 1} {'type': 'pplx', 'content': 331.47364260793984, 'timestamp': '2025-09-04 03:49:41.839937', 'step': 520, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 520', 'timestamp': '2025-09-04 03:49:42.197749', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:49:42.304263', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.004516193643212318, 'timestamp': '2025-09-04 03:49:42.326503', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:42.420198', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.029628755524754524, 'timestamp': '2025-09-04 03:49:42.437520', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:42.537022', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.016332386061549187, 'timestamp': '2025-09-04 03:49:42.555319', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:49:42.657450', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.12754516303539276, 'timestamp': '2025-09-04 03:49:42.677064', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:42.767235', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.0154288774356246, 'timestamp': '2025-09-04 03:49:42.785875', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:42.888440', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.04728075489401817, 'timestamp': '2025-09-04 03:49:42.907264', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 03:49:42.974277', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.039872948080301285, 'timestamp': '2025-09-04 03:49:42.985085', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:49:43.103378', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.15819469094276428, 'timestamp': '2025-09-04 03:49:43.126041', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:43.214657', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.00868944637477398, 'timestamp': '2025-09-04 03:49:43.232835', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:49:43.310728', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.043173011392354965, 'timestamp': '2025-09-04 03:49:43.324708', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:43.424559', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.009749419055879116, 'timestamp': '2025-09-04 03:49:43.443157', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:43.542929', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.060370367020368576, 'timestamp': '2025-09-04 03:49:43.562461', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:49:43.646414', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.05955101177096367, 'timestamp': '2025-09-04 03:49:43.663498', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:49:43.769394', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.011455871164798737, 'timestamp': '2025-09-04 03:49:43.789213', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:43.883263', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.017603786662220955, 'timestamp': '2025-09-04 03:49:43.900564', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:43.999470', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.004019308835268021, 'timestamp': '2025-09-04 03:49:44.018774', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:44.108934', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.03117678314447403, 'timestamp': '2025-09-04 03:49:44.127529', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:49:44.228887', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.02788373827934265, 'timestamp': '2025-09-04 03:49:44.247823', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:49:44.323710', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.023979298770427704, 'timestamp': '2025-09-04 03:49:44.337235', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:44.427420', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.0515667125582695, 'timestamp': '2025-09-04 03:49:44.444749', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:49:52.800904', 'step': 540, 'epoch': 1} {'type': 'pplx', 'content': 328.1148207360572, 'timestamp': '2025-09-04 03:49:52.802715', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:49:52.884845', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.034760650247335434, 'timestamp': '2025-09-04 03:49:52.901766', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:49:53.030246', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.061195723712444305, 'timestamp': '2025-09-04 03:49:53.053121', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:53.147174', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.04390040412545204, 'timestamp': '2025-09-04 03:49:53.164465', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:49:53.258655', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.01793227717280388, 'timestamp': '2025-09-04 03:49:53.276613', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:49:53.414366', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.0374876894056797, 'timestamp': '2025-09-04 03:49:53.442685', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:53.543275', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.08235947042703629, 'timestamp': '2025-09-04 03:49:53.561611', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:53.656351', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.04841390252113342, 'timestamp': '2025-09-04 03:49:53.673261', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:49:53.781485', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.01317357737571001, 'timestamp': '2025-09-04 03:49:53.802283', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:53.893276', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.016209116205573082, 'timestamp': '2025-09-04 03:49:53.911823', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:49:54.016291', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.08481376618146896, 'timestamp': '2025-09-04 03:49:54.035369', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:49:54.144944', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.045819301158189774, 'timestamp': '2025-09-04 03:49:54.165459', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:54.256078', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.02893437258899212, 'timestamp': '2025-09-04 03:49:54.273632', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:49:54.393325', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.0218905508518219, 'timestamp': '2025-09-04 03:49:54.418542', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:54.511444', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.03285536542534828, 'timestamp': '2025-09-04 03:49:54.528330', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:49:54.630178', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.008444556035101414, 'timestamp': '2025-09-04 03:49:54.649073', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:49:54.740380', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.04841528460383415, 'timestamp': '2025-09-04 03:49:54.757766', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:49:54.848997', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.02788812294602394, 'timestamp': '2025-09-04 03:49:54.867556', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:49:54.942951', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.03387841582298279, 'timestamp': '2025-09-04 03:49:54.956225', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:49:55.056086', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.038461048156023026, 'timestamp': '2025-09-04 03:49:55.074363', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:49:55.161963', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.025093531236052513, 'timestamp': '2025-09-04 03:49:55.177890', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:03.607868', 'step': 560, 'epoch': 1} {'type': 'pplx', 'content': 323.1010053666772, 'timestamp': '2025-09-04 03:50:03.610489', 'step': 560, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 560', 'timestamp': '2025-09-04 03:50:04.111690', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:50:04.194180', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.02403130754828453, 'timestamp': '2025-09-04 03:50:04.210903', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:04.300277', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.010109679773449898, 'timestamp': '2025-09-04 03:50:04.316739', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:04.418541', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.053742896765470505, 'timestamp': '2025-09-04 03:50:04.436868', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:50:04.547209', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.025198856368660927, 'timestamp': '2025-09-04 03:50:04.568436', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:04.655610', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.04810674116015434, 'timestamp': '2025-09-04 03:50:04.673889', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:50:04.782853', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.01729135774075985, 'timestamp': '2025-09-04 03:50:04.802978', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:04.912535', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.03990844264626503, 'timestamp': '2025-09-04 03:50:04.932830', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:50:05.016165', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.018527284264564514, 'timestamp': '2025-09-04 03:50:05.031879', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:05.123611', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.04572749882936478, 'timestamp': '2025-09-04 03:50:05.142464', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:05.251316', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.06394918262958527, 'timestamp': '2025-09-04 03:50:05.271592', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:05.365527', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.023581665009260178, 'timestamp': '2025-09-04 03:50:05.382808', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:05.485680', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.026009058579802513, 'timestamp': '2025-09-04 03:50:05.505623', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:50:05.612207', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.011551128700375557, 'timestamp': '2025-09-04 03:50:05.634936', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:05.737394', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.0341804064810276, 'timestamp': '2025-09-04 03:50:05.756726', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:50:05.849571', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.09191818535327911, 'timestamp': '2025-09-04 03:50:05.866319', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:05.966270', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.008478672243654728, 'timestamp': '2025-09-04 03:50:05.985773', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:06.086160', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.10518831759691238, 'timestamp': '2025-09-04 03:50:06.107230', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:06.205533', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.0350443534553051, 'timestamp': '2025-09-04 03:50:06.223903', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:50:06.295644', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.03610123321413994, 'timestamp': '2025-09-04 03:50:06.308378', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:06.402413', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.07018542289733887, 'timestamp': '2025-09-04 03:50:06.420355', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:14.881907', 'step': 580, 'epoch': 1} {'type': 'pplx', 'content': 323.03369944740126, 'timestamp': '2025-09-04 03:50:14.884156', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:14.980302', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.009829767979681492, 'timestamp': '2025-09-04 03:50:15.000981', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 03:50:15.073314', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.01034059002995491, 'timestamp': '2025-09-04 03:50:15.085969', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:15.182867', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.023535916581749916, 'timestamp': '2025-09-04 03:50:15.200207', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:50:15.306387', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.048321448266506195, 'timestamp': '2025-09-04 03:50:15.326896', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:15.426730', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.007726522162556648, 'timestamp': '2025-09-04 03:50:15.447627', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:50:15.525155', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.10942022502422333, 'timestamp': '2025-09-04 03:50:15.538987', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:50:15.645303', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.00408227788284421, 'timestamp': '2025-09-04 03:50:15.665165', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:15.767654', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.013964972458779812, 'timestamp': '2025-09-04 03:50:15.787665', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:15.894691', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.061664290726184845, 'timestamp': '2025-09-04 03:50:15.916999', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:50:16.000197', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.08124200999736786, 'timestamp': '2025-09-04 03:50:16.015088', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:50:16.098629', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.012426053173840046, 'timestamp': '2025-09-04 03:50:16.113698', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:16.213709', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.11045590788125992, 'timestamp': '2025-09-04 03:50:16.233343', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:16.333263', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.03249969705939293, 'timestamp': '2025-09-04 03:50:16.354179', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:50:16.464080', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.023777302354574203, 'timestamp': '2025-09-04 03:50:16.484351', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:16.593894', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.017426855862140656, 'timestamp': '2025-09-04 03:50:16.614358', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:16.715910', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.032828718423843384, 'timestamp': '2025-09-04 03:50:16.735355', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:16.826539', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.08800698816776276, 'timestamp': '2025-09-04 03:50:16.845406', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:16.939524', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.05674955993890762, 'timestamp': '2025-09-04 03:50:16.956622', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:17.057539', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.02611662819981575, 'timestamp': '2025-09-04 03:50:17.076255', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:50:17.154424', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.037351664155721664, 'timestamp': '2025-09-04 03:50:17.169108', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:25.545854', 'step': 600, 'epoch': 1} {'type': 'pplx', 'content': 324.7015294662687, 'timestamp': '2025-09-04 03:50:25.548258', 'step': 600, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 600', 'timestamp': '2025-09-04 03:50:25.891967', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:50:25.963785', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.05272102355957031, 'timestamp': '2025-09-04 03:50:25.978431', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:26.080090', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.013583468273282051, 'timestamp': '2025-09-04 03:50:26.099071', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:26.197206', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.021021874621510506, 'timestamp': '2025-09-04 03:50:26.215799', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:26.310990', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.012546015903353691, 'timestamp': '2025-09-04 03:50:26.329269', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:26.429291', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.04589983820915222, 'timestamp': '2025-09-04 03:50:26.450420', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:26.545770', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.04309983178973198, 'timestamp': '2025-09-04 03:50:26.563268', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:50:26.648264', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.06627572327852249, 'timestamp': '2025-09-04 03:50:26.663786', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:26.758202', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.02102210372686386, 'timestamp': '2025-09-04 03:50:26.776450', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:50:26.892497', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.04680265486240387, 'timestamp': '2025-09-04 03:50:26.916116', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:27.019321', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.05080176889896393, 'timestamp': '2025-09-04 03:50:27.038659', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:27.138655', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.02795407734811306, 'timestamp': '2025-09-04 03:50:27.157004', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:50:27.231965', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.025690896436572075, 'timestamp': '2025-09-04 03:50:27.246524', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:50:27.326510', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.011252271011471748, 'timestamp': '2025-09-04 03:50:27.343040', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:50:27.426099', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.029573671519756317, 'timestamp': '2025-09-04 03:50:27.440961', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:27.542987', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.03328055888414383, 'timestamp': '2025-09-04 03:50:27.562129', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:27.656249', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.03198854997754097, 'timestamp': '2025-09-04 03:50:27.673816', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:27.775050', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.05293620377779007, 'timestamp': '2025-09-04 03:50:27.795724', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:50:27.918294', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.030497848987579346, 'timestamp': '2025-09-04 03:50:27.941330', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:28.055770', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.003063701558858156, 'timestamp': '2025-09-04 03:50:28.074792', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:50:28.196976', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.00672591058537364, 'timestamp': '2025-09-04 03:50:28.220908', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:36.596005', 'step': 620, 'epoch': 1} {'type': 'pplx', 'content': 330.22229839916525, 'timestamp': '2025-09-04 03:50:36.598510', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:50:36.680013', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.07592124491930008, 'timestamp': '2025-09-04 03:50:36.697115', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:36.797854', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.03790687769651413, 'timestamp': '2025-09-04 03:50:36.816637', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:50:36.894069', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.05864779278635979, 'timestamp': '2025-09-04 03:50:36.907933', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:50:37.000229', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.04656538739800453, 'timestamp': '2025-09-04 03:50:37.017840', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:37.106987', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.08541495352983475, 'timestamp': '2025-09-04 03:50:37.125351', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:37.227295', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.019130367785692215, 'timestamp': '2025-09-04 03:50:37.245938', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:37.339064', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.030174510553479195, 'timestamp': '2025-09-04 03:50:37.356225', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:37.451088', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.012393724173307419, 'timestamp': '2025-09-04 03:50:37.469176', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:37.557289', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.03318622708320618, 'timestamp': '2025-09-04 03:50:37.575472', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:50:37.652296', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.041939180344343185, 'timestamp': '2025-09-04 03:50:37.666329', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:37.759762', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.01324822474271059, 'timestamp': '2025-09-04 03:50:37.777018', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:50:37.855647', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.011515927501022816, 'timestamp': '2025-09-04 03:50:37.870228', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:50:37.960942', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.03529660031199455, 'timestamp': '2025-09-04 03:50:37.979640', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:38.081658', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.038785651326179504, 'timestamp': '2025-09-04 03:50:38.100818', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 03:50:38.239722', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.014862718991935253, 'timestamp': '2025-09-04 03:50:38.265727', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:38.366128', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.054356202483177185, 'timestamp': '2025-09-04 03:50:38.385549', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:38.485770', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.12962757050991058, 'timestamp': '2025-09-04 03:50:38.506762', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:38.617492', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.04526481032371521, 'timestamp': '2025-09-04 03:50:38.635828', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:38.739668', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.022485429421067238, 'timestamp': '2025-09-04 03:50:38.758702', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:38.861201', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.01773577369749546, 'timestamp': '2025-09-04 03:50:38.880872', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:47.264720', 'step': 640, 'epoch': 1} {'type': 'pplx', 'content': 331.5543720069065, 'timestamp': '2025-09-04 03:50:47.267008', 'step': 640, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 640', 'timestamp': '2025-09-04 03:50:47.629883', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 03:50:47.702312', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.018003080040216446, 'timestamp': '2025-09-04 03:50:47.714561', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:50:47.791539', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.01093088649213314, 'timestamp': '2025-09-04 03:50:47.805588', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:47.905150', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.02146296203136444, 'timestamp': '2025-09-04 03:50:47.923461', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:48.024088', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.04993215575814247, 'timestamp': '2025-09-04 03:50:48.043552', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:50:48.143506', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.08081956207752228, 'timestamp': '2025-09-04 03:50:48.164354', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:48.277592', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.018953487277030945, 'timestamp': '2025-09-04 03:50:48.297957', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:48.396340', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.024002674967050552, 'timestamp': '2025-09-04 03:50:48.413527', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:50:48.521196', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.004336449783295393, 'timestamp': '2025-09-04 03:50:48.542030', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:48.633156', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.0077681634575128555, 'timestamp': '2025-09-04 03:50:48.652155', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:48.745478', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.03083970956504345, 'timestamp': '2025-09-04 03:50:48.762789', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:48.872150', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.012532150372862816, 'timestamp': '2025-09-04 03:50:48.892489', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:48.993220', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.060628145933151245, 'timestamp': '2025-09-04 03:50:49.012649', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:49.103744', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.008690881542861462, 'timestamp': '2025-09-04 03:50:49.122962', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:50:49.223690', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.014455270953476429, 'timestamp': '2025-09-04 03:50:49.242713', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:49.336934', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.013105835765600204, 'timestamp': '2025-09-04 03:50:49.354425', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:50:49.439534', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.07032399624586105, 'timestamp': '2025-09-04 03:50:49.455865', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:50:49.528709', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.07407846301794052, 'timestamp': '2025-09-04 03:50:49.543655', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:50:49.651803', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.009337909519672394, 'timestamp': '2025-09-04 03:50:49.672288', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:49.766720', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.017982447519898415, 'timestamp': '2025-09-04 03:50:49.784007', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:50:49.889673', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.009215055964887142, 'timestamp': '2025-09-04 03:50:49.910355', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:50:58.286370', 'step': 660, 'epoch': 1} {'type': 'pplx', 'content': 330.0641296778058, 'timestamp': '2025-09-04 03:50:58.288230', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:50:58.386848', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.02920273132622242, 'timestamp': '2025-09-04 03:50:58.408025', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:50:58.498758', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.007981177419424057, 'timestamp': '2025-09-04 03:50:58.515280', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:58.615295', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.017763612791895866, 'timestamp': '2025-09-04 03:50:58.633595', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:50:58.734437', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.013931555673480034, 'timestamp': '2025-09-04 03:50:58.753624', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:50:58.858226', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.013805502094328403, 'timestamp': '2025-09-04 03:50:58.880186', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:50:58.975496', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.040271319448947906, 'timestamp': '2025-09-04 03:50:58.992762', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:50:59.088259', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.008608100935816765, 'timestamp': '2025-09-04 03:50:59.105528', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:50:59.215099', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.018997695297002792, 'timestamp': '2025-09-04 03:50:59.235830', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:50:59.340368', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.04310667887330055, 'timestamp': '2025-09-04 03:50:59.362359', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:50:59.436010', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.01862409897148609, 'timestamp': '2025-09-04 03:50:59.448669', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:50:59.533601', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.032819997519254684, 'timestamp': '2025-09-04 03:50:59.548822', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:50:59.655577', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.024017833173274994, 'timestamp': '2025-09-04 03:50:59.676063', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:50:59.767083', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.026013823226094246, 'timestamp': '2025-09-04 03:50:59.785657', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:50:59.879185', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.0160057432949543, 'timestamp': '2025-09-04 03:50:59.896099', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:00.000068', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.016003988683223724, 'timestamp': '2025-09-04 03:51:00.019146', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:51:00.115125', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.021060237661004066, 'timestamp': '2025-09-04 03:51:00.133211', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:00.236367', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.05808281898498535, 'timestamp': '2025-09-04 03:51:00.258278', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 03:51:00.421947', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.013201756402850151, 'timestamp': '2025-09-04 03:51:00.453778', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:51:00.538723', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.02031007781624794, 'timestamp': '2025-09-04 03:51:00.553892', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:00.657200', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.034204911440610886, 'timestamp': '2025-09-04 03:51:00.676840', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:51:09.031779', 'step': 680, 'epoch': 1} {'type': 'pplx', 'content': 330.2300531347332, 'timestamp': '2025-09-04 03:51:09.033666', 'step': 680, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 680', 'timestamp': '2025-09-04 03:51:09.446722', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:51:09.542085', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.06078000366687775, 'timestamp': '2025-09-04 03:51:09.562352', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:09.664340', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.017026284709572792, 'timestamp': '2025-09-04 03:51:09.683255', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:09.786570', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.03394745662808418, 'timestamp': '2025-09-04 03:51:09.805775', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:09.908393', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.03497334569692612, 'timestamp': '2025-09-04 03:51:09.928050', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:51:10.017677', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.025791537016630173, 'timestamp': '2025-09-04 03:51:10.036239', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:51:10.152751', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.049982357770204544, 'timestamp': '2025-09-04 03:51:10.174866', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:10.265581', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.023276111111044884, 'timestamp': '2025-09-04 03:51:10.282308', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:51:10.378533', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.016552327200770378, 'timestamp': '2025-09-04 03:51:10.396569', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:51:10.513537', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.014692885801196098, 'timestamp': '2025-09-04 03:51:10.537386', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:10.643966', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.08659341931343079, 'timestamp': '2025-09-04 03:51:10.663724', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:10.741883', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.030853156000375748, 'timestamp': '2025-09-04 03:51:10.755931', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:51:10.839386', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.030888579785823822, 'timestamp': '2025-09-04 03:51:10.854966', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:51:10.936300', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.03811494633555412, 'timestamp': '2025-09-04 03:51:10.952811', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:11.058803', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.013888245448470116, 'timestamp': '2025-09-04 03:51:11.078585', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:51:11.214227', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.03252805024385452, 'timestamp': '2025-09-04 03:51:11.239966', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:51:11.325914', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.057707663625478745, 'timestamp': '2025-09-04 03:51:11.342329', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:51:11.414814', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.013484358787536621, 'timestamp': '2025-09-04 03:51:11.429372', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:51:11.501648', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.05915412679314613, 'timestamp': '2025-09-04 03:51:11.514316', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:51:11.610602', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.024636950343847275, 'timestamp': '2025-09-04 03:51:11.627855', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:51:11.721668', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.017293041571974754, 'timestamp': '2025-09-04 03:51:11.739406', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:51:20.115650', 'step': 700, 'epoch': 1} {'type': 'pplx', 'content': 334.05662580498205, 'timestamp': '2025-09-04 03:51:20.117602', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:20.216569', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.020366515964269638, 'timestamp': '2025-09-04 03:51:20.237665', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:20.345409', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.03450450301170349, 'timestamp': '2025-09-04 03:51:20.365186', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:20.442718', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.031207023188471794, 'timestamp': '2025-09-04 03:51:20.456727', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:20.548080', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.09872627258300781, 'timestamp': '2025-09-04 03:51:20.565328', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:20.663398', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.03153887018561363, 'timestamp': '2025-09-04 03:51:20.683915', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:20.786006', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.025656569749116898, 'timestamp': '2025-09-04 03:51:20.805069', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:20.905492', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.007625031750649214, 'timestamp': '2025-09-04 03:51:20.924018', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:21.001433', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.013403641991317272, 'timestamp': '2025-09-04 03:51:21.016042', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:51:21.089639', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.04379529878497124, 'timestamp': '2025-09-04 03:51:21.104530', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:21.210734', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.03386307880282402, 'timestamp': '2025-09-04 03:51:21.230561', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:21.339667', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.023136505857110023, 'timestamp': '2025-09-04 03:51:21.360277', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:21.470082', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.01604538969695568, 'timestamp': '2025-09-04 03:51:21.491196', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:51:21.574067', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.0386708602309227, 'timestamp': '2025-09-04 03:51:21.590838', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:21.701235', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.00971259456127882, 'timestamp': '2025-09-04 03:51:21.721504', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:21.812160', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.06396154314279556, 'timestamp': '2025-09-04 03:51:21.829002', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:51:21.928434', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.044354457408189774, 'timestamp': '2025-09-04 03:51:21.947527', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:22.037338', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.02656322531402111, 'timestamp': '2025-09-04 03:51:22.055648', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:51:22.134993', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.03358420729637146, 'timestamp': '2025-09-04 03:51:22.148882', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:22.250678', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.008803064003586769, 'timestamp': '2025-09-04 03:51:22.269676', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:22.360223', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.1287224441766739, 'timestamp': '2025-09-04 03:51:22.377492', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:51:30.762033', 'step': 720, 'epoch': 1} {'type': 'pplx', 'content': 336.26355182577953, 'timestamp': '2025-09-04 03:51:30.763799', 'step': 720, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 720', 'timestamp': '2025-09-04 03:51:31.234372', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:51:31.352125', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.00398655841127038, 'timestamp': '2025-09-04 03:51:31.377462', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:51:31.461725', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.007859241217374802, 'timestamp': '2025-09-04 03:51:31.477012', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:51:31.571329', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.027930831536650658, 'timestamp': '2025-09-04 03:51:31.588467', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:51:31.687794', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.008873346261680126, 'timestamp': '2025-09-04 03:51:31.706953', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:31.783615', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.06903643906116486, 'timestamp': '2025-09-04 03:51:31.798877', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:31.901354', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.05149473994970322, 'timestamp': '2025-09-04 03:51:31.920403', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:51:31.996940', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.0763927474617958, 'timestamp': '2025-09-04 03:51:32.010521', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:32.113768', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.0026997928507626057, 'timestamp': '2025-09-04 03:51:32.133545', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:51:32.210082', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.0394502729177475, 'timestamp': '2025-09-04 03:51:32.225171', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:32.334298', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.009021886624395847, 'timestamp': '2025-09-04 03:51:32.354559', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:51:32.442076', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.00816608127206564, 'timestamp': '2025-09-04 03:51:32.457490', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 03:51:32.583086', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.03670935705304146, 'timestamp': '2025-09-04 03:51:32.607621', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:51:32.699631', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.056674499064683914, 'timestamp': '2025-09-04 03:51:32.718181', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:51:32.803873', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.14876963198184967, 'timestamp': '2025-09-04 03:51:32.818788', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:51:32.912540', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.012708455324172974, 'timestamp': '2025-09-04 03:51:32.929701', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:33.039167', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.09964840114116669, 'timestamp': '2025-09-04 03:51:33.060181', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:33.161479', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.03136486932635307, 'timestamp': '2025-09-04 03:51:33.181948', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:33.285762', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.04356590285897255, 'timestamp': '2025-09-04 03:51:33.304795', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:33.405495', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.01183301117271185, 'timestamp': '2025-09-04 03:51:33.424143', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:33.503460', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.02132677659392357, 'timestamp': '2025-09-04 03:51:33.518178', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:51:41.890219', 'step': 740, 'epoch': 1} {'type': 'pplx', 'content': 330.76754669519477, 'timestamp': '2025-09-04 03:51:41.892697', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:41.994167', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.044686876237392426, 'timestamp': '2025-09-04 03:51:42.015959', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:42.126745', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.04039061442017555, 'timestamp': '2025-09-04 03:51:42.147095', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:42.249214', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.022664330899715424, 'timestamp': '2025-09-04 03:51:42.267518', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:51:42.364434', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.02850925363600254, 'timestamp': '2025-09-04 03:51:42.382084', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:42.472301', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.03210241347551346, 'timestamp': '2025-09-04 03:51:42.490534', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:51:42.570864', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.0489024817943573, 'timestamp': '2025-09-04 03:51:42.584554', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:51:42.693167', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.034052006900310516, 'timestamp': '2025-09-04 03:51:42.713106', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:51:42.830107', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.012416801415383816, 'timestamp': '2025-09-04 03:51:42.852718', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:51:42.960820', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.05252353101968765, 'timestamp': '2025-09-04 03:51:42.983407', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:51:43.060970', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.021606309339404106, 'timestamp': '2025-09-04 03:51:43.074914', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:43.165463', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.009972968138754368, 'timestamp': '2025-09-04 03:51:43.182006', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:51:43.292002', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.017765356227755547, 'timestamp': '2025-09-04 03:51:43.313301', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:43.411771', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.04759746789932251, 'timestamp': '2025-09-04 03:51:43.432225', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:51:43.540540', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.023347793146967888, 'timestamp': '2025-09-04 03:51:43.560567', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:51:43.644484', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.04734707623720169, 'timestamp': '2025-09-04 03:51:43.659595', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:43.768987', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.026827994734048843, 'timestamp': '2025-09-04 03:51:43.790268', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:51:43.866644', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.055104807019233704, 'timestamp': '2025-09-04 03:51:43.881921', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:43.984924', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.03936297073960304, 'timestamp': '2025-09-04 03:51:44.004115', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:51:44.090732', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.07558504492044449, 'timestamp': '2025-09-04 03:51:44.106115', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:51:44.208853', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.011148090474307537, 'timestamp': '2025-09-04 03:51:44.228842', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:51:52.633240', 'step': 760, 'epoch': 1} {'type': 'pplx', 'content': 325.2410786741373, 'timestamp': '2025-09-04 03:51:52.635164', 'step': 760, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 760', 'timestamp': '2025-09-04 03:51:53.016229', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:51:53.098194', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.038804348558187485, 'timestamp': '2025-09-04 03:51:53.114486', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:53.225215', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.09771209210157394, 'timestamp': '2025-09-04 03:51:53.245500', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:51:53.339628', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.015294702723622322, 'timestamp': '2025-09-04 03:51:53.356673', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:53.457292', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.06018395349383354, 'timestamp': '2025-09-04 03:51:53.476673', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:51:53.573539', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.015591312199831009, 'timestamp': '2025-09-04 03:51:53.593665', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:51:53.686695', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.04299828037619591, 'timestamp': '2025-09-04 03:51:53.703554', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:53.806707', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.06639469414949417, 'timestamp': '2025-09-04 03:51:53.825270', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:51:53.927352', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.06221333146095276, 'timestamp': '2025-09-04 03:51:53.947041', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:51:54.053386', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.014062750153243542, 'timestamp': '2025-09-04 03:51:54.075653', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:51:54.184801', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.05087198689579964, 'timestamp': '2025-09-04 03:51:54.204773', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:54.305488', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.021481147035956383, 'timestamp': '2025-09-04 03:51:54.324163', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 03:51:54.394241', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.031000670045614243, 'timestamp': '2025-09-04 03:51:54.407850', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:51:54.499948', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.0011418497888371348, 'timestamp': '2025-09-04 03:51:54.518917', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:51:54.625581', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.02311205305159092, 'timestamp': '2025-09-04 03:51:54.645492', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 03:51:54.780476', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.02674124389886856, 'timestamp': '2025-09-04 03:51:54.806404', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:51:54.915425', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.04661679267883301, 'timestamp': '2025-09-04 03:51:54.936816', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:51:55.019120', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.04548424482345581, 'timestamp': '2025-09-04 03:51:55.036198', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:51:55.125560', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.007166591938585043, 'timestamp': '2025-09-04 03:51:55.142205', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:55.244199', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.06729776412248611, 'timestamp': '2025-09-04 03:51:55.262780', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:51:55.362266', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.007628207094967365, 'timestamp': '2025-09-04 03:51:55.381603', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:03.772736', 'step': 780, 'epoch': 1} {'type': 'pplx', 'content': 323.20181798344566, 'timestamp': '2025-09-04 03:52:03.775082', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:52:03.857745', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.004363184329122305, 'timestamp': '2025-09-04 03:52:03.874922', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:03.975884', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.01874414086341858, 'timestamp': '2025-09-04 03:52:03.994665', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:52:04.082345', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.0951443463563919, 'timestamp': '2025-09-04 03:52:04.097756', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:04.192694', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.09938696026802063, 'timestamp': '2025-09-04 03:52:04.210641', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:52:04.323748', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.010282545350492, 'timestamp': '2025-09-04 03:52:04.347897', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:04.446431', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.1636459231376648, 'timestamp': '2025-09-04 03:52:04.465085', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:52:04.551108', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.022835776209831238, 'timestamp': '2025-09-04 03:52:04.566559', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:52:04.688871', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.004113573580980301, 'timestamp': '2025-09-04 03:52:04.712558', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:04.813658', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.006749707739800215, 'timestamp': '2025-09-04 03:52:04.834669', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:04.935197', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.04576265811920166, 'timestamp': '2025-09-04 03:52:04.953879', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:05.047740', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.024862490594387054, 'timestamp': '2025-09-04 03:52:05.065114', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:05.165236', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.006702667102217674, 'timestamp': '2025-09-04 03:52:05.184551', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:05.276679', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.017640264704823494, 'timestamp': '2025-09-04 03:52:05.295657', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:05.390345', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.03208797425031662, 'timestamp': '2025-09-04 03:52:05.407251', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:05.508560', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.03511238843202591, 'timestamp': '2025-09-04 03:52:05.527172', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:05.620511', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.08914028853178024, 'timestamp': '2025-09-04 03:52:05.638141', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:05.741354', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.08203835040330887, 'timestamp': '2025-09-04 03:52:05.763282', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:05.867104', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.04981414973735809, 'timestamp': '2025-09-04 03:52:05.886381', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:05.985482', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.007873651571571827, 'timestamp': '2025-09-04 03:52:06.004060', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:06.106641', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.008369551040232182, 'timestamp': '2025-09-04 03:52:06.125955', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:14.582165', 'step': 800, 'epoch': 1} {'type': 'pplx', 'content': 322.3950710812259, 'timestamp': '2025-09-04 03:52:14.584469', 'step': 800, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 800', 'timestamp': '2025-09-04 03:52:14.943950', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:15.032165', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.013865088112652302, 'timestamp': '2025-09-04 03:52:15.050000', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:52:15.145702', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.011338443495333195, 'timestamp': '2025-09-04 03:52:15.162426', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:15.266835', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.01849699579179287, 'timestamp': '2025-09-04 03:52:15.285675', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 03:52:15.424546', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.046258725225925446, 'timestamp': '2025-09-04 03:52:15.450883', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:15.541096', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.06461029499769211, 'timestamp': '2025-09-04 03:52:15.558768', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:15.650764', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.010986247099936008, 'timestamp': '2025-09-04 03:52:15.667207', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:15.771785', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.002984520047903061, 'timestamp': '2025-09-04 03:52:15.790613', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:15.883372', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.014198306016623974, 'timestamp': '2025-09-04 03:52:15.900243', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:16.002786', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.03292492404580116, 'timestamp': '2025-09-04 03:52:16.023360', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:16.128536', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.031847354024648666, 'timestamp': '2025-09-04 03:52:16.147172', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:52:16.225296', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.0046399482525885105, 'timestamp': '2025-09-04 03:52:16.238472', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:16.350870', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.028976459056138992, 'timestamp': '2025-09-04 03:52:16.371461', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:16.471884', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.06571343541145325, 'timestamp': '2025-09-04 03:52:16.491658', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:16.597372', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.004271671175956726, 'timestamp': '2025-09-04 03:52:16.615972', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:16.721553', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.04265111684799194, 'timestamp': '2025-09-04 03:52:16.740168', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:16.843856', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.0795649066567421, 'timestamp': '2025-09-04 03:52:16.862808', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:16.965373', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.007744000293314457, 'timestamp': '2025-09-04 03:52:16.985818', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:52:17.097956', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.014986629597842693, 'timestamp': '2025-09-04 03:52:17.118420', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:52:17.240914', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.019025737419724464, 'timestamp': '2025-09-04 03:52:17.262418', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:17.369933', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.05677224323153496, 'timestamp': '2025-09-04 03:52:17.390488', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:25.880722', 'step': 820, 'epoch': 1} {'type': 'pplx', 'content': 324.48675427637124, 'timestamp': '2025-09-04 03:52:25.882971', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:25.983499', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.029658645391464233, 'timestamp': '2025-09-04 03:52:26.004547', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:52:26.093326', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.01038702204823494, 'timestamp': '2025-09-04 03:52:26.108769', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:26.214631', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.001975145423784852, 'timestamp': '2025-09-04 03:52:26.231760', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 03:52:26.416298', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.0497569777071476, 'timestamp': '2025-09-04 03:52:26.451588', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:26.569750', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.02662699669599533, 'timestamp': '2025-09-04 03:52:26.588463', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:26.697814', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.0038292373064905405, 'timestamp': '2025-09-04 03:52:26.718387', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:26.821113', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.07293792068958282, 'timestamp': '2025-09-04 03:52:26.840094', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:52:26.923604', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.13208356499671936, 'timestamp': '2025-09-04 03:52:26.939479', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:52:27.024504', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.03219275176525116, 'timestamp': '2025-09-04 03:52:27.041444', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:27.131547', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.00648118881508708, 'timestamp': '2025-09-04 03:52:27.148096', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:27.248076', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.04718891531229019, 'timestamp': '2025-09-04 03:52:27.266378', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:27.365554', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.004248224198818207, 'timestamp': '2025-09-04 03:52:27.384715', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 03:52:27.565166', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.010911565274000168, 'timestamp': '2025-09-04 03:52:27.603065', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:27.702939', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.036790501326322556, 'timestamp': '2025-09-04 03:52:27.721421', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:52:27.798834', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.08580999821424484, 'timestamp': '2025-09-04 03:52:27.812651', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:27.912675', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.00923218298703432, 'timestamp': '2025-09-04 03:52:27.932194', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:28.031507', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.007355087902396917, 'timestamp': '2025-09-04 03:52:28.052146', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:28.148182', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.017146985977888107, 'timestamp': '2025-09-04 03:52:28.165495', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:28.267178', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.021363992244005203, 'timestamp': '2025-09-04 03:52:28.286139', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:28.395064', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.07390808314085007, 'timestamp': '2025-09-04 03:52:28.416262', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:36.789891', 'step': 840, 'epoch': 1} {'type': 'pplx', 'content': 324.41111865398125, 'timestamp': '2025-09-04 03:52:36.791946', 'step': 840, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 840', 'timestamp': '2025-09-04 03:52:37.138794', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:37.228808', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.008902303874492645, 'timestamp': '2025-09-04 03:52:37.247382', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:52:37.322184', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.04457436874508858, 'timestamp': '2025-09-04 03:52:37.335432', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:52:37.443107', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.011131439357995987, 'timestamp': '2025-09-04 03:52:37.463335', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:37.571948', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.03997879475355148, 'timestamp': '2025-09-04 03:52:37.593109', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:37.681186', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.05826606974005699, 'timestamp': '2025-09-04 03:52:37.699242', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:37.798938', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.013584684580564499, 'timestamp': '2025-09-04 03:52:37.817704', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 03:52:38.022798', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.00445243064314127, 'timestamp': '2025-09-04 03:52:38.061808', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:52:38.163815', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.022547859698534012, 'timestamp': '2025-09-04 03:52:38.183460', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:52:38.289760', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.02277173474431038, 'timestamp': '2025-09-04 03:52:38.312237', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:52:38.396170', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.055064212530851364, 'timestamp': '2025-09-04 03:52:38.411132', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:52:38.494339', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.009578646160662174, 'timestamp': '2025-09-04 03:52:38.509205', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:38.603857', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.006946980953216553, 'timestamp': '2025-09-04 03:52:38.621878', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:38.723300', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.10980971902608871, 'timestamp': '2025-09-04 03:52:38.744250', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:38.836222', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.02813224121928215, 'timestamp': '2025-09-04 03:52:38.852773', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:38.952308', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.04458193480968475, 'timestamp': '2025-09-04 03:52:38.970909', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:52:39.048237', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.03821039944887161, 'timestamp': '2025-09-04 03:52:39.062827', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:39.152919', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.032031524926424026, 'timestamp': '2025-09-04 03:52:39.171567', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:52:39.266660', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.026080451905727386, 'timestamp': '2025-09-04 03:52:39.283145', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:39.393922', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.06203945353627205, 'timestamp': '2025-09-04 03:52:39.414479', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:39.518098', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.0699310228228569, 'timestamp': '2025-09-04 03:52:39.537950', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:47.899492', 'step': 860, 'epoch': 1} {'type': 'pplx', 'content': 322.49340806315365, 'timestamp': '2025-09-04 03:52:47.901562', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:48.003512', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.03510546684265137, 'timestamp': '2025-09-04 03:52:48.025476', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:48.129042', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.010071228258311749, 'timestamp': '2025-09-04 03:52:48.148376', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:48.251091', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.03212030977010727, 'timestamp': '2025-09-04 03:52:48.270097', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:48.375653', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.09204772859811783, 'timestamp': '2025-09-04 03:52:48.396300', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:48.486243', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.03038841113448143, 'timestamp': '2025-09-04 03:52:48.504904', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:48.606251', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.058843135833740234, 'timestamp': '2025-09-04 03:52:48.624971', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 03:52:48.753998', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.011757034808397293, 'timestamp': '2025-09-04 03:52:48.778443', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:48.883752', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.055646833032369614, 'timestamp': '2025-09-04 03:52:48.903198', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:48.999421', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.007672054693102837, 'timestamp': '2025-09-04 03:52:49.019789', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:49.126050', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.04655780270695686, 'timestamp': '2025-09-04 03:52:49.145853', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:52:49.239252', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.009051861241459846, 'timestamp': '2025-09-04 03:52:49.256412', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:52:49.340677', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.013100282289087772, 'timestamp': '2025-09-04 03:52:49.356697', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:52:49.437905', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.017356760799884796, 'timestamp': '2025-09-04 03:52:49.454466', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:49.557371', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.02461932599544525, 'timestamp': '2025-09-04 03:52:49.576385', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:52:49.683118', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.040192510932683945, 'timestamp': '2025-09-04 03:52:49.702845', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:49.803670', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.04767023026943207, 'timestamp': '2025-09-04 03:52:49.823120', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:49.930415', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.0067078592255711555, 'timestamp': '2025-09-04 03:52:49.952767', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:52:50.051711', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.0437939316034317, 'timestamp': '2025-09-04 03:52:50.070336', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:52:50.180351', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.00434476463124156, 'timestamp': '2025-09-04 03:52:50.200624', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:50.293161', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.02230999432504177, 'timestamp': '2025-09-04 03:52:50.310845', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:52:58.696210', 'step': 880, 'epoch': 1} {'type': 'pplx', 'content': 320.0679803850578, 'timestamp': '2025-09-04 03:52:58.697942', 'step': 880, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 880', 'timestamp': '2025-09-04 03:52:59.217621', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:59.308241', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.021877720952033997, 'timestamp': '2025-09-04 03:52:59.326920', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:52:59.423770', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.03120221011340618, 'timestamp': '2025-09-04 03:52:59.441082', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:59.533374', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.017093293368816376, 'timestamp': '2025-09-04 03:52:59.550315', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:52:59.654042', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.045136742293834686, 'timestamp': '2025-09-04 03:52:59.674000', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:52:59.771460', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.06527303159236908, 'timestamp': '2025-09-04 03:52:59.792160', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:52:59.887195', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.01749110408127308, 'timestamp': '2025-09-04 03:52:59.904116', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:00.010118', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.006269685458391905, 'timestamp': '2025-09-04 03:53:00.029842', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:00.134196', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.012028466910123825, 'timestamp': '2025-09-04 03:53:00.154044', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:53:00.265849', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.030681060627102852, 'timestamp': '2025-09-04 03:53:00.288336', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:00.394509', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.014401630498468876, 'timestamp': '2025-09-04 03:53:00.413831', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:53:00.497526', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.04923472926020622, 'timestamp': '2025-09-04 03:53:00.512537', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:00.611406', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.056294139474630356, 'timestamp': '2025-09-04 03:53:00.630919', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:53:00.735420', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.05868148058652878, 'timestamp': '2025-09-04 03:53:00.755788', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:00.857660', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.04890258610248566, 'timestamp': '2025-09-04 03:53:00.876619', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:00.976525', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.024650558829307556, 'timestamp': '2025-09-04 03:53:00.995278', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:01.094960', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.007203355897217989, 'timestamp': '2025-09-04 03:53:01.114386', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:01.215264', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.04748008772730827, 'timestamp': '2025-09-04 03:53:01.236042', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:01.342527', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.011301451362669468, 'timestamp': '2025-09-04 03:53:01.362187', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:01.452563', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.03385727107524872, 'timestamp': '2025-09-04 03:53:01.469235', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:01.575911', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.037546951323747635, 'timestamp': '2025-09-04 03:53:01.595843', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:53:09.952980', 'step': 900, 'epoch': 1} {'type': 'pplx', 'content': 319.89447585761627, 'timestamp': '2025-09-04 03:53:09.954474', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:10.052039', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.01678859256207943, 'timestamp': '2025-09-04 03:53:10.073130', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:10.166839', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.038133881986141205, 'timestamp': '2025-09-04 03:53:10.183993', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:10.274791', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.019613448530435562, 'timestamp': '2025-09-04 03:53:10.291441', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:53:10.379595', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.020688189193606377, 'timestamp': '2025-09-04 03:53:10.395757', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:10.487110', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.005966340657323599, 'timestamp': '2025-09-04 03:53:10.505940', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:10.607711', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.01767931506037712, 'timestamp': '2025-09-04 03:53:10.626611', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:10.719273', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.02633294090628624, 'timestamp': '2025-09-04 03:53:10.736157', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:10.839880', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.03189924731850624, 'timestamp': '2025-09-04 03:53:10.859731', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:10.962770', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.003187986556440592, 'timestamp': '2025-09-04 03:53:10.984587', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:11.084774', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.07128451019525528, 'timestamp': '2025-09-04 03:53:11.101434', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:11.213242', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.048821836709976196, 'timestamp': '2025-09-04 03:53:11.232355', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:11.347294', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.00408367533236742, 'timestamp': '2025-09-04 03:53:11.366967', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:11.473704', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.04687316343188286, 'timestamp': '2025-09-04 03:53:11.492664', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:11.586826', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.035709548741579056, 'timestamp': '2025-09-04 03:53:11.603966', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:11.707434', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.0572127141058445, 'timestamp': '2025-09-04 03:53:11.726695', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:11.816754', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.09570334106683731, 'timestamp': '2025-09-04 03:53:11.834156', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:11.932053', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.05318867787718773, 'timestamp': '2025-09-04 03:53:11.952724', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:12.054726', 'step': 917, 'epoch': 1} {'type': 'loss', 'content': 0.008588760159909725, 'timestamp': '2025-09-04 03:53:12.073695', 'step': 918, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:53:12.212450', 'step': 918, 'epoch': 1} {'type': 'loss', 'content': 0.007535313721746206, 'timestamp': '2025-09-04 03:53:12.238373', 'step': 919, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:12.332103', 'step': 919, 'epoch': 1} {'type': 'loss', 'content': 0.020334357395768166, 'timestamp': '2025-09-04 03:53:12.350325', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:53:20.755706', 'step': 920, 'epoch': 1} {'type': 'pplx', 'content': 323.29331082900484, 'timestamp': '2025-09-04 03:53:20.757985', 'step': 920, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 920', 'timestamp': '2025-09-04 03:53:21.106061', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:21.206709', 'step': 920, 'epoch': 1} {'type': 'loss', 'content': 0.004637205507606268, 'timestamp': '2025-09-04 03:53:21.227753', 'step': 921, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:53:21.303208', 'step': 921, 'epoch': 1} {'type': 'loss', 'content': 0.02929893508553505, 'timestamp': '2025-09-04 03:53:21.316393', 'step': 922, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:21.409671', 'step': 922, 'epoch': 1} {'type': 'loss', 'content': 0.05244254320859909, 'timestamp': '2025-09-04 03:53:21.426608', 'step': 923, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:53:21.561973', 'step': 923, 'epoch': 1} {'type': 'loss', 'content': 0.014228380285203457, 'timestamp': '2025-09-04 03:53:21.588607', 'step': 924, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:21.689260', 'step': 924, 'epoch': 1} {'type': 'loss', 'content': 0.014327945187687874, 'timestamp': '2025-09-04 03:53:21.710185', 'step': 925, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:53:21.806681', 'step': 925, 'epoch': 1} {'type': 'loss', 'content': 0.04167867451906204, 'timestamp': '2025-09-04 03:53:21.823957', 'step': 926, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:21.914206', 'step': 926, 'epoch': 1} {'type': 'loss', 'content': 0.004316235426813364, 'timestamp': '2025-09-04 03:53:21.930710', 'step': 927, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:53:22.006415', 'step': 927, 'epoch': 1} {'type': 'loss', 'content': 0.04848542809486389, 'timestamp': '2025-09-04 03:53:22.020751', 'step': 928, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:53:22.117538', 'step': 928, 'epoch': 1} {'type': 'loss', 'content': 0.008897616527974606, 'timestamp': '2025-09-04 03:53:22.137654', 'step': 929, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:53:22.246947', 'step': 929, 'epoch': 1} {'type': 'loss', 'content': 0.006201483774930239, 'timestamp': '2025-09-04 03:53:22.267302', 'step': 930, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:22.361441', 'step': 930, 'epoch': 1} {'type': 'loss', 'content': 0.06400009244680405, 'timestamp': '2025-09-04 03:53:22.378595', 'step': 931, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:53:22.502905', 'step': 931, 'epoch': 1} {'type': 'loss', 'content': 0.009115933440625668, 'timestamp': '2025-09-04 03:53:22.526648', 'step': 932, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:22.626849', 'step': 932, 'epoch': 1} {'type': 'loss', 'content': 0.01854240521788597, 'timestamp': '2025-09-04 03:53:22.647644', 'step': 933, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 03:53:22.821929', 'step': 933, 'epoch': 1} {'type': 'loss', 'content': 0.015260078944265842, 'timestamp': '2025-09-04 03:53:22.854322', 'step': 934, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:22.961777', 'step': 934, 'epoch': 1} {'type': 'loss', 'content': 0.009783388115465641, 'timestamp': '2025-09-04 03:53:22.981491', 'step': 935, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:23.090151', 'step': 935, 'epoch': 1} {'type': 'loss', 'content': 0.01038886234164238, 'timestamp': '2025-09-04 03:53:23.110577', 'step': 936, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:23.199339', 'step': 936, 'epoch': 1} {'type': 'loss', 'content': 0.04066663980484009, 'timestamp': '2025-09-04 03:53:23.217678', 'step': 937, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:23.324197', 'step': 937, 'epoch': 1} {'type': 'loss', 'content': 0.015022533014416695, 'timestamp': '2025-09-04 03:53:23.343914', 'step': 938, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:53:23.429503', 'step': 938, 'epoch': 1} {'type': 'loss', 'content': 0.019334964454174042, 'timestamp': '2025-09-04 03:53:23.444805', 'step': 939, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:23.551488', 'step': 939, 'epoch': 1} {'type': 'loss', 'content': 0.027674207463860512, 'timestamp': '2025-09-04 03:53:23.572051', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:53:31.943440', 'step': 940, 'epoch': 1} {'type': 'pplx', 'content': 324.7636312669318, 'timestamp': '2025-09-04 03:53:31.945581', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:32.047643', 'step': 940, 'epoch': 1} {'type': 'loss', 'content': 0.004062741529196501, 'timestamp': '2025-09-04 03:53:32.069548', 'step': 941, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:53:32.273285', 'step': 941, 'epoch': 1} {'type': 'loss', 'content': 0.11786609143018723, 'timestamp': '2025-09-04 03:53:32.312283', 'step': 942, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:53:32.390789', 'step': 942, 'epoch': 1} {'type': 'loss', 'content': 0.012708038091659546, 'timestamp': '2025-09-04 03:53:32.404914', 'step': 943, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:53:32.523185', 'step': 943, 'epoch': 1} {'type': 'loss', 'content': 0.05294226482510567, 'timestamp': '2025-09-04 03:53:32.546081', 'step': 944, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:53:32.652441', 'step': 944, 'epoch': 1} {'type': 'loss', 'content': 0.06667114049196243, 'timestamp': '2025-09-04 03:53:32.674610', 'step': 945, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:32.765158', 'step': 945, 'epoch': 1} {'type': 'loss', 'content': 0.017390718683600426, 'timestamp': '2025-09-04 03:53:32.781730', 'step': 946, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:53:32.866576', 'step': 946, 'epoch': 1} {'type': 'loss', 'content': 0.003352563362568617, 'timestamp': '2025-09-04 03:53:32.881804', 'step': 947, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:32.984148', 'step': 947, 'epoch': 1} {'type': 'loss', 'content': 0.059882860630750656, 'timestamp': '2025-09-04 03:53:33.003916', 'step': 948, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:53:33.096962', 'step': 948, 'epoch': 1} {'type': 'loss', 'content': 0.04028277471661568, 'timestamp': '2025-09-04 03:53:33.115970', 'step': 949, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 03:53:33.290357', 'step': 949, 'epoch': 1} {'type': 'loss', 'content': 0.018983660265803337, 'timestamp': '2025-09-04 03:53:33.324730', 'step': 950, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:33.418146', 'step': 950, 'epoch': 1} {'type': 'loss', 'content': 0.01843661069869995, 'timestamp': '2025-09-04 03:53:33.435067', 'step': 951, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:53:33.518359', 'step': 951, 'epoch': 1} {'type': 'loss', 'content': 0.029021859169006348, 'timestamp': '2025-09-04 03:53:33.534111', 'step': 952, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:53:33.617540', 'step': 952, 'epoch': 1} {'type': 'loss', 'content': 0.01326957531273365, 'timestamp': '2025-09-04 03:53:33.634605', 'step': 953, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:53:33.713033', 'step': 953, 'epoch': 1} {'type': 'loss', 'content': 0.0053786844946444035, 'timestamp': '2025-09-04 03:53:33.726795', 'step': 954, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:33.829268', 'step': 954, 'epoch': 1} {'type': 'loss', 'content': 0.009815702214837074, 'timestamp': '2025-09-04 03:53:33.848427', 'step': 955, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:53:33.943180', 'step': 955, 'epoch': 1} {'type': 'loss', 'content': 0.012396165169775486, 'timestamp': '2025-09-04 03:53:33.961066', 'step': 956, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:53:34.067090', 'step': 956, 'epoch': 1} {'type': 'loss', 'content': 0.01327445451170206, 'timestamp': '2025-09-04 03:53:34.089193', 'step': 957, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:53:34.292580', 'step': 957, 'epoch': 1} {'type': 'loss', 'content': 0.0052812471985816956, 'timestamp': '2025-09-04 03:53:34.331717', 'step': 958, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:34.436182', 'step': 958, 'epoch': 1} {'type': 'loss', 'content': 0.037195418030023575, 'timestamp': '2025-09-04 03:53:34.455354', 'step': 959, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:53:34.554713', 'step': 959, 'epoch': 1} {'type': 'loss', 'content': 0.0028625449631363153, 'timestamp': '2025-09-04 03:53:34.574149', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:53:42.954180', 'step': 960, 'epoch': 1} {'type': 'pplx', 'content': 327.4406282794405, 'timestamp': '2025-09-04 03:53:42.956295', 'step': 960, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 960', 'timestamp': '2025-09-04 03:53:43.310072', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:53:43.426090', 'step': 960, 'epoch': 1} {'type': 'loss', 'content': 0.11174288392066956, 'timestamp': '2025-09-04 03:53:43.449893', 'step': 961, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:43.552254', 'step': 961, 'epoch': 1} {'type': 'loss', 'content': 0.06079157814383507, 'timestamp': '2025-09-04 03:53:43.571555', 'step': 962, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:43.674623', 'step': 962, 'epoch': 1} {'type': 'loss', 'content': 0.007762270979583263, 'timestamp': '2025-09-04 03:53:43.693819', 'step': 963, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:53:43.788814', 'step': 963, 'epoch': 1} {'type': 'loss', 'content': 0.1044481098651886, 'timestamp': '2025-09-04 03:53:43.807251', 'step': 964, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:53:43.898657', 'step': 964, 'epoch': 1} {'type': 'loss', 'content': 0.04733144864439964, 'timestamp': '2025-09-04 03:53:43.917440', 'step': 965, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:44.018358', 'step': 965, 'epoch': 1} {'type': 'loss', 'content': 0.04005102813243866, 'timestamp': '2025-09-04 03:53:44.037363', 'step': 966, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:44.139306', 'step': 966, 'epoch': 1} {'type': 'loss', 'content': 0.015302048996090889, 'timestamp': '2025-09-04 03:53:44.158514', 'step': 967, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:44.262372', 'step': 967, 'epoch': 1} {'type': 'loss', 'content': 0.04137500002980232, 'timestamp': '2025-09-04 03:53:44.282149', 'step': 968, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:53:44.355720', 'step': 968, 'epoch': 1} {'type': 'loss', 'content': 0.012012657709419727, 'timestamp': '2025-09-04 03:53:44.370202', 'step': 969, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:44.473736', 'step': 969, 'epoch': 1} {'type': 'loss', 'content': 0.0035866987891495228, 'timestamp': '2025-09-04 03:53:44.492727', 'step': 970, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:44.593769', 'step': 970, 'epoch': 1} {'type': 'loss', 'content': 0.011977934278547764, 'timestamp': '2025-09-04 03:53:44.612318', 'step': 971, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:53:44.712118', 'step': 971, 'epoch': 1} {'type': 'loss', 'content': 0.010067245922982693, 'timestamp': '2025-09-04 03:53:44.731252', 'step': 972, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:44.819828', 'step': 972, 'epoch': 1} {'type': 'loss', 'content': 0.025547686964273453, 'timestamp': '2025-09-04 03:53:44.837978', 'step': 973, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:53:44.915441', 'step': 973, 'epoch': 1} {'type': 'loss', 'content': 0.037151534110307693, 'timestamp': '2025-09-04 03:53:44.929215', 'step': 974, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 03:53:45.143074', 'step': 974, 'epoch': 1} {'type': 'loss', 'content': 0.027699746191501617, 'timestamp': '2025-09-04 03:53:45.184042', 'step': 975, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:53:45.279228', 'step': 975, 'epoch': 1} {'type': 'loss', 'content': 0.05770542845129967, 'timestamp': '2025-09-04 03:53:45.297565', 'step': 976, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:53:45.373416', 'step': 976, 'epoch': 1} {'type': 'loss', 'content': 0.010280147194862366, 'timestamp': '2025-09-04 03:53:45.388732', 'step': 977, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:45.491256', 'step': 977, 'epoch': 1} {'type': 'loss', 'content': 0.016326548531651497, 'timestamp': '2025-09-04 03:53:45.510479', 'step': 978, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 03:53:45.637463', 'step': 978, 'epoch': 1} {'type': 'loss', 'content': 0.0350724533200264, 'timestamp': '2025-09-04 03:53:45.661885', 'step': 979, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:45.765390', 'step': 979, 'epoch': 1} {'type': 'loss', 'content': 0.012970902025699615, 'timestamp': '2025-09-04 03:53:45.785448', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:53:54.156759', 'step': 980, 'epoch': 1} {'type': 'pplx', 'content': 326.69142554401174, 'timestamp': '2025-09-04 03:53:54.158794', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:53:54.233062', 'step': 980, 'epoch': 1} {'type': 'loss', 'content': 0.008149470202624798, 'timestamp': '2025-09-04 03:53:54.248341', 'step': 981, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:53:54.351582', 'step': 981, 'epoch': 1} {'type': 'loss', 'content': 0.04477942734956741, 'timestamp': '2025-09-04 03:53:54.370852', 'step': 982, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:54.471887', 'step': 982, 'epoch': 1} {'type': 'loss', 'content': 0.04230527952313423, 'timestamp': '2025-09-04 03:53:54.490758', 'step': 983, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:53:54.569763', 'step': 983, 'epoch': 1} {'type': 'loss', 'content': 0.021094702184200287, 'timestamp': '2025-09-04 03:53:54.584677', 'step': 984, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:54.681680', 'step': 984, 'epoch': 1} {'type': 'loss', 'content': 0.050900258123874664, 'timestamp': '2025-09-04 03:53:54.702364', 'step': 985, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:53:54.810576', 'step': 985, 'epoch': 1} {'type': 'loss', 'content': 0.014919820241630077, 'timestamp': '2025-09-04 03:53:54.830622', 'step': 986, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:54.921462', 'step': 986, 'epoch': 1} {'type': 'loss', 'content': 0.0671142116189003, 'timestamp': '2025-09-04 03:53:54.938186', 'step': 987, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:53:55.024616', 'step': 987, 'epoch': 1} {'type': 'loss', 'content': 0.04901759326457977, 'timestamp': '2025-09-04 03:53:55.040769', 'step': 988, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:53:55.129148', 'step': 988, 'epoch': 1} {'type': 'loss', 'content': 0.02041156031191349, 'timestamp': '2025-09-04 03:53:55.147314', 'step': 989, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:53:55.222313', 'step': 989, 'epoch': 1} {'type': 'loss', 'content': 0.03128139674663544, 'timestamp': '2025-09-04 03:53:55.236053', 'step': 990, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:55.328730', 'step': 990, 'epoch': 1} {'type': 'loss', 'content': 0.1105596199631691, 'timestamp': '2025-09-04 03:53:55.345640', 'step': 991, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:55.446529', 'step': 991, 'epoch': 1} {'type': 'loss', 'content': 0.2516331374645233, 'timestamp': '2025-09-04 03:53:55.465931', 'step': 992, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:53:55.565425', 'step': 992, 'epoch': 1} {'type': 'loss', 'content': 0.006421744357794523, 'timestamp': '2025-09-04 03:53:55.586207', 'step': 993, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:55.678970', 'step': 993, 'epoch': 1} {'type': 'loss', 'content': 0.042763665318489075, 'timestamp': '2025-09-04 03:53:55.695888', 'step': 994, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:55.797442', 'step': 994, 'epoch': 1} {'type': 'loss', 'content': 0.004555892664939165, 'timestamp': '2025-09-04 03:53:55.816067', 'step': 995, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:53:55.922577', 'step': 995, 'epoch': 1} {'type': 'loss', 'content': 0.04880642145872116, 'timestamp': '2025-09-04 03:53:55.943075', 'step': 996, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:56.040785', 'step': 996, 'epoch': 1} {'type': 'loss', 'content': 0.029496213421225548, 'timestamp': '2025-09-04 03:53:56.061263', 'step': 997, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:53:56.154620', 'step': 997, 'epoch': 1} {'type': 'loss', 'content': 0.01086695957928896, 'timestamp': '2025-09-04 03:53:56.171470', 'step': 998, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:53:56.272119', 'step': 998, 'epoch': 1} {'type': 'loss', 'content': 0.03509443625807762, 'timestamp': '2025-09-04 03:53:56.290774', 'step': 999, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:53:56.400883', 'step': 999, 'epoch': 1} {'type': 'loss', 'content': 0.05549605190753937, 'timestamp': '2025-09-04 03:53:56.422038', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:54:04.789966', 'step': 1000, 'epoch': 1} {'type': 'pplx', 'content': 323.93892790465287, 'timestamp': '2025-09-04 03:54:04.792236', 'step': 1000, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-04 03:54:05.144251', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:54:05.217721', 'step': 1000, 'epoch': 1} {'type': 'loss', 'content': 0.03098498098552227, 'timestamp': '2025-09-04 03:54:05.232637', 'step': 1001, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:05.333112', 'step': 1001, 'epoch': 1} {'type': 'loss', 'content': 0.018459502607584, 'timestamp': '2025-09-04 03:54:05.352002', 'step': 1002, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:54:05.436559', 'step': 1002, 'epoch': 1} {'type': 'loss', 'content': 0.045362938195466995, 'timestamp': '2025-09-04 03:54:05.451997', 'step': 1003, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:54:05.546333', 'step': 1003, 'epoch': 1} {'type': 'loss', 'content': 0.013697554357349873, 'timestamp': '2025-09-04 03:54:05.564241', 'step': 1004, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:05.656441', 'step': 1004, 'epoch': 1} {'type': 'loss', 'content': 0.014663312584161758, 'timestamp': '2025-09-04 03:54:05.675346', 'step': 1005, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:05.767072', 'step': 1005, 'epoch': 1} {'type': 'loss', 'content': 0.033350620418787, 'timestamp': '2025-09-04 03:54:05.783605', 'step': 1006, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:05.887104', 'step': 1006, 'epoch': 1} {'type': 'loss', 'content': 0.0400872677564621, 'timestamp': '2025-09-04 03:54:05.906229', 'step': 1007, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:54:06.015408', 'step': 1007, 'epoch': 1} {'type': 'loss', 'content': 0.03101273812353611, 'timestamp': '2025-09-04 03:54:06.036432', 'step': 1008, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:54:06.129001', 'step': 1008, 'epoch': 1} {'type': 'loss', 'content': 0.01906219683587551, 'timestamp': '2025-09-04 03:54:06.147639', 'step': 1009, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:06.250583', 'step': 1009, 'epoch': 1} {'type': 'loss', 'content': 0.015632882714271545, 'timestamp': '2025-09-04 03:54:06.269673', 'step': 1010, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:06.373501', 'step': 1010, 'epoch': 1} {'type': 'loss', 'content': 0.015542350709438324, 'timestamp': '2025-09-04 03:54:06.392552', 'step': 1011, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:06.486372', 'step': 1011, 'epoch': 1} {'type': 'loss', 'content': 0.010193181224167347, 'timestamp': '2025-09-04 03:54:06.504449', 'step': 1012, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:06.604714', 'step': 1012, 'epoch': 1} {'type': 'loss', 'content': 0.009116173721849918, 'timestamp': '2025-09-04 03:54:06.625559', 'step': 1013, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:54:06.734943', 'step': 1013, 'epoch': 1} {'type': 'loss', 'content': 0.011737700551748276, 'timestamp': '2025-09-04 03:54:06.755347', 'step': 1014, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:06.858522', 'step': 1014, 'epoch': 1} {'type': 'loss', 'content': 0.05300810933113098, 'timestamp': '2025-09-04 03:54:06.877568', 'step': 1015, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:54:06.964396', 'step': 1015, 'epoch': 1} {'type': 'loss', 'content': 0.016496986150741577, 'timestamp': '2025-09-04 03:54:06.980756', 'step': 1016, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:54:07.063396', 'step': 1016, 'epoch': 1} {'type': 'loss', 'content': 0.06345777213573456, 'timestamp': '2025-09-04 03:54:07.080148', 'step': 1017, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:07.171364', 'step': 1017, 'epoch': 1} {'type': 'loss', 'content': 0.01955530233681202, 'timestamp': '2025-09-04 03:54:07.188018', 'step': 1018, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:07.288317', 'step': 1018, 'epoch': 1} {'type': 'loss', 'content': 0.009021622128784657, 'timestamp': '2025-09-04 03:54:07.306933', 'step': 1019, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:07.407378', 'step': 1019, 'epoch': 1} {'type': 'loss', 'content': 0.02454635314643383, 'timestamp': '2025-09-04 03:54:07.426759', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:54:15.807366', 'step': 1020, 'epoch': 1} {'type': 'pplx', 'content': 322.27552972591826, 'timestamp': '2025-09-04 03:54:15.809410', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:54:15.888783', 'step': 1020, 'epoch': 1} {'type': 'loss', 'content': 0.04511036351323128, 'timestamp': '2025-09-04 03:54:15.905147', 'step': 1021, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:54:16.013227', 'step': 1021, 'epoch': 1} {'type': 'loss', 'content': 0.12731988728046417, 'timestamp': '2025-09-04 03:54:16.033415', 'step': 1022, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 03:54:16.169915', 'step': 1022, 'epoch': 1} {'type': 'loss', 'content': 0.05131891369819641, 'timestamp': '2025-09-04 03:54:16.195930', 'step': 1023, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:16.297930', 'step': 1023, 'epoch': 1} {'type': 'loss', 'content': 0.020861327648162842, 'timestamp': '2025-09-04 03:54:16.317755', 'step': 1024, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:54:16.420449', 'step': 1024, 'epoch': 1} {'type': 'loss', 'content': 0.05731099098920822, 'timestamp': '2025-09-04 03:54:16.442256', 'step': 1025, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:16.544840', 'step': 1025, 'epoch': 1} {'type': 'loss', 'content': 0.04736688733100891, 'timestamp': '2025-09-04 03:54:16.563934', 'step': 1026, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:16.662732', 'step': 1026, 'epoch': 1} {'type': 'loss', 'content': 0.024726245552301407, 'timestamp': '2025-09-04 03:54:16.681418', 'step': 1027, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:54:16.764441', 'step': 1027, 'epoch': 1} {'type': 'loss', 'content': 0.07053575664758682, 'timestamp': '2025-09-04 03:54:16.780300', 'step': 1028, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:54:16.886324', 'step': 1028, 'epoch': 1} {'type': 'loss', 'content': 0.003608345054090023, 'timestamp': '2025-09-04 03:54:16.908667', 'step': 1029, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:54:17.003170', 'step': 1029, 'epoch': 1} {'type': 'loss', 'content': 0.01059285830706358, 'timestamp': '2025-09-04 03:54:17.020000', 'step': 1030, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:17.120442', 'step': 1030, 'epoch': 1} {'type': 'loss', 'content': 0.026861051097512245, 'timestamp': '2025-09-04 03:54:17.139094', 'step': 1031, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 03:54:17.313090', 'step': 1031, 'epoch': 1} {'type': 'loss', 'content': 0.012563884258270264, 'timestamp': '2025-09-04 03:54:17.346335', 'step': 1032, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:54:17.453087', 'step': 1032, 'epoch': 1} {'type': 'loss', 'content': 0.008008824661374092, 'timestamp': '2025-09-04 03:54:17.475593', 'step': 1033, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:17.578569', 'step': 1033, 'epoch': 1} {'type': 'loss', 'content': 0.004494689870625734, 'timestamp': '2025-09-04 03:54:17.597833', 'step': 1034, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:54:17.676279', 'step': 1034, 'epoch': 1} {'type': 'loss', 'content': 0.05889247730374336, 'timestamp': '2025-09-04 03:54:17.690066', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:17.794363', 'step': 1035, 'epoch': 1} {'type': 'loss', 'content': 0.03641991689801216, 'timestamp': '2025-09-04 03:54:17.814291', 'step': 1036, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:17.915191', 'step': 1036, 'epoch': 1} {'type': 'loss', 'content': 0.015427891165018082, 'timestamp': '2025-09-04 03:54:17.936059', 'step': 1037, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:54:18.047638', 'step': 1037, 'epoch': 1} {'type': 'loss', 'content': 0.04161277413368225, 'timestamp': '2025-09-04 03:54:18.068089', 'step': 1038, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:18.169531', 'step': 1038, 'epoch': 1} {'type': 'loss', 'content': 0.013311103917658329, 'timestamp': '2025-09-04 03:54:18.188142', 'step': 1039, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:18.289767', 'step': 1039, 'epoch': 1} {'type': 'loss', 'content': 0.056311286985874176, 'timestamp': '2025-09-04 03:54:18.309120', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:54:26.699707', 'step': 1040, 'epoch': 1} {'type': 'pplx', 'content': 325.4783086719936, 'timestamp': '2025-09-04 03:54:26.702081', 'step': 1040, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1040', 'timestamp': '2025-09-04 03:54:27.217343', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:54:27.293442', 'step': 1040, 'epoch': 1} {'type': 'loss', 'content': 0.005170788615942001, 'timestamp': '2025-09-04 03:54:27.308800', 'step': 1041, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:27.411906', 'step': 1041, 'epoch': 1} {'type': 'loss', 'content': 0.0431942418217659, 'timestamp': '2025-09-04 03:54:27.430779', 'step': 1042, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 03:54:27.562183', 'step': 1042, 'epoch': 1} {'type': 'loss', 'content': 0.003155430080369115, 'timestamp': '2025-09-04 03:54:27.585655', 'step': 1043, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:27.676253', 'step': 1043, 'epoch': 1} {'type': 'loss', 'content': 0.031002743169665337, 'timestamp': '2025-09-04 03:54:27.693852', 'step': 1044, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1008], 'flops': 20160122450880.0}, 'timestamp': '2025-09-04 03:54:27.835708', 'step': 1044, 'epoch': 1} {'type': 'loss', 'content': 0.0061140297912061214, 'timestamp': '2025-09-04 03:54:27.866848', 'step': 1045, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:27.970250', 'step': 1045, 'epoch': 1} {'type': 'loss', 'content': 0.024821752682328224, 'timestamp': '2025-09-04 03:54:27.989454', 'step': 1046, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:54:28.097849', 'step': 1046, 'epoch': 1} {'type': 'loss', 'content': 0.02534925378859043, 'timestamp': '2025-09-04 03:54:28.118164', 'step': 1047, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:28.217521', 'step': 1047, 'epoch': 1} {'type': 'loss', 'content': 0.06829417496919632, 'timestamp': '2025-09-04 03:54:28.236845', 'step': 1048, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:28.337902', 'step': 1048, 'epoch': 1} {'type': 'loss', 'content': 0.011538490653038025, 'timestamp': '2025-09-04 03:54:28.359059', 'step': 1049, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:28.460838', 'step': 1049, 'epoch': 1} {'type': 'loss', 'content': 0.017921442165970802, 'timestamp': '2025-09-04 03:54:28.479791', 'step': 1050, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:54:28.557920', 'step': 1050, 'epoch': 1} {'type': 'loss', 'content': 0.00594189902767539, 'timestamp': '2025-09-04 03:54:28.572056', 'step': 1051, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:28.661977', 'step': 1051, 'epoch': 1} {'type': 'loss', 'content': 0.06706573069095612, 'timestamp': '2025-09-04 03:54:28.679555', 'step': 1052, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:54:28.793603', 'step': 1052, 'epoch': 1} {'type': 'loss', 'content': 0.07539967447519302, 'timestamp': '2025-09-04 03:54:28.817899', 'step': 1053, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:28.911701', 'step': 1053, 'epoch': 1} {'type': 'loss', 'content': 0.01035250723361969, 'timestamp': '2025-09-04 03:54:28.929113', 'step': 1054, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:54:29.003669', 'step': 1054, 'epoch': 1} {'type': 'loss', 'content': 0.024351386353373528, 'timestamp': '2025-09-04 03:54:29.017233', 'step': 1055, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:29.122010', 'step': 1055, 'epoch': 1} {'type': 'loss', 'content': 0.041539065539836884, 'timestamp': '2025-09-04 03:54:29.141863', 'step': 1056, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:29.238138', 'step': 1056, 'epoch': 1} {'type': 'loss', 'content': 0.03460566699504852, 'timestamp': '2025-09-04 03:54:29.258519', 'step': 1057, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:54:29.393251', 'step': 1057, 'epoch': 1} {'type': 'loss', 'content': 0.005672653205692768, 'timestamp': '2025-09-04 03:54:29.418907', 'step': 1058, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:29.520526', 'step': 1058, 'epoch': 1} {'type': 'loss', 'content': 0.04056652635335922, 'timestamp': '2025-09-04 03:54:29.539428', 'step': 1059, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:54:29.625500', 'step': 1059, 'epoch': 1} {'type': 'loss', 'content': 0.018356602638959885, 'timestamp': '2025-09-04 03:54:29.641923', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:54:38.037867', 'step': 1060, 'epoch': 1} {'type': 'pplx', 'content': 329.5744553045225, 'timestamp': '2025-09-04 03:54:38.039700', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:38.137979', 'step': 1060, 'epoch': 1} {'type': 'loss', 'content': 0.018076708540320396, 'timestamp': '2025-09-04 03:54:38.159151', 'step': 1061, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:54:38.238436', 'step': 1061, 'epoch': 1} {'type': 'loss', 'content': 0.008117503486573696, 'timestamp': '2025-09-04 03:54:38.252549', 'step': 1062, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:54:38.330718', 'step': 1062, 'epoch': 1} {'type': 'loss', 'content': 0.09541762620210648, 'timestamp': '2025-09-04 03:54:38.344655', 'step': 1063, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:54:38.455471', 'step': 1063, 'epoch': 1} {'type': 'loss', 'content': 0.004544029477983713, 'timestamp': '2025-09-04 03:54:38.476657', 'step': 1064, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:54:38.553048', 'step': 1064, 'epoch': 1} {'type': 'loss', 'content': 0.018085738644003868, 'timestamp': '2025-09-04 03:54:38.568555', 'step': 1065, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:38.672167', 'step': 1065, 'epoch': 1} {'type': 'loss', 'content': 0.008873535320162773, 'timestamp': '2025-09-04 03:54:38.691241', 'step': 1066, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:54:38.798973', 'step': 1066, 'epoch': 1} {'type': 'loss', 'content': 0.016320547088980675, 'timestamp': '2025-09-04 03:54:38.818751', 'step': 1067, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:38.923333', 'step': 1067, 'epoch': 1} {'type': 'loss', 'content': 0.04694967344403267, 'timestamp': '2025-09-04 03:54:38.943139', 'step': 1068, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:54:39.051635', 'step': 1068, 'epoch': 1} {'type': 'loss', 'content': 0.05457077920436859, 'timestamp': '2025-09-04 03:54:39.073300', 'step': 1069, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:39.169792', 'step': 1069, 'epoch': 1} {'type': 'loss', 'content': 0.006687942426651716, 'timestamp': '2025-09-04 03:54:39.186448', 'step': 1070, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:39.281245', 'step': 1070, 'epoch': 1} {'type': 'loss', 'content': 0.019025299698114395, 'timestamp': '2025-09-04 03:54:39.298353', 'step': 1071, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:39.390091', 'step': 1071, 'epoch': 1} {'type': 'loss', 'content': 0.020227275788784027, 'timestamp': '2025-09-04 03:54:39.407340', 'step': 1072, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:54:39.500568', 'step': 1072, 'epoch': 1} {'type': 'loss', 'content': 0.05452266335487366, 'timestamp': '2025-09-04 03:54:39.519561', 'step': 1073, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:54:39.604031', 'step': 1073, 'epoch': 1} {'type': 'loss', 'content': 0.027135973796248436, 'timestamp': '2025-09-04 03:54:39.618960', 'step': 1074, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:39.718358', 'step': 1074, 'epoch': 1} {'type': 'loss', 'content': 0.027994517236948013, 'timestamp': '2025-09-04 03:54:39.736747', 'step': 1075, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:54:39.843238', 'step': 1075, 'epoch': 1} {'type': 'loss', 'content': 0.023070676252245903, 'timestamp': '2025-09-04 03:54:39.863118', 'step': 1076, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:39.963035', 'step': 1076, 'epoch': 1} {'type': 'loss', 'content': 0.01172536239027977, 'timestamp': '2025-09-04 03:54:39.983911', 'step': 1077, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:54:40.070577', 'step': 1077, 'epoch': 1} {'type': 'loss', 'content': 0.029984669759869576, 'timestamp': '2025-09-04 03:54:40.086018', 'step': 1078, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:40.179735', 'step': 1078, 'epoch': 1} {'type': 'loss', 'content': 0.024687113240361214, 'timestamp': '2025-09-04 03:54:40.196981', 'step': 1079, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:40.300861', 'step': 1079, 'epoch': 1} {'type': 'loss', 'content': 0.008470497094094753, 'timestamp': '2025-09-04 03:54:40.320739', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:54:48.711208', 'step': 1080, 'epoch': 1} {'type': 'pplx', 'content': 331.8358204811036, 'timestamp': '2025-09-04 03:54:48.713485', 'step': 1080, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1080', 'timestamp': '2025-09-04 03:54:49.226299', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:49.323088', 'step': 1080, 'epoch': 1} {'type': 'loss', 'content': 0.08154661953449249, 'timestamp': '2025-09-04 03:54:49.343622', 'step': 1081, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:54:49.448147', 'step': 1081, 'epoch': 1} {'type': 'loss', 'content': 0.028146987780928612, 'timestamp': '2025-09-04 03:54:49.468034', 'step': 1082, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:49.571712', 'step': 1082, 'epoch': 1} {'type': 'loss', 'content': 0.024996791034936905, 'timestamp': '2025-09-04 03:54:49.590885', 'step': 1083, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1424], 'flops': 28480172958272.0}, 'timestamp': '2025-09-04 03:54:49.802309', 'step': 1083, 'epoch': 1} {'type': 'loss', 'content': 0.03116009198129177, 'timestamp': '2025-09-04 03:54:49.843613', 'step': 1084, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:49.941437', 'step': 1084, 'epoch': 1} {'type': 'loss', 'content': 0.032337624579668045, 'timestamp': '2025-09-04 03:54:49.961390', 'step': 1085, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:54:50.058815', 'step': 1085, 'epoch': 1} {'type': 'loss', 'content': 0.07175617665052414, 'timestamp': '2025-09-04 03:54:50.075368', 'step': 1086, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:50.169947', 'step': 1086, 'epoch': 1} {'type': 'loss', 'content': 0.0184723399579525, 'timestamp': '2025-09-04 03:54:50.186963', 'step': 1087, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:50.291628', 'step': 1087, 'epoch': 1} {'type': 'loss', 'content': 0.03585413470864296, 'timestamp': '2025-09-04 03:54:50.311636', 'step': 1088, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:54:50.392897', 'step': 1088, 'epoch': 1} {'type': 'loss', 'content': 0.03618989884853363, 'timestamp': '2025-09-04 03:54:50.409370', 'step': 1089, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:54:50.518834', 'step': 1089, 'epoch': 1} {'type': 'loss', 'content': 0.05040483921766281, 'timestamp': '2025-09-04 03:54:50.538855', 'step': 1090, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:54:50.639859', 'step': 1090, 'epoch': 1} {'type': 'loss', 'content': 0.05431760847568512, 'timestamp': '2025-09-04 03:54:50.658560', 'step': 1091, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:54:50.753968', 'step': 1091, 'epoch': 1} {'type': 'loss', 'content': 0.10624273121356964, 'timestamp': '2025-09-04 03:54:50.772035', 'step': 1092, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:54:50.844993', 'step': 1092, 'epoch': 1} {'type': 'loss', 'content': 0.03237966448068619, 'timestamp': '2025-09-04 03:54:50.859614', 'step': 1093, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:54:50.958245', 'step': 1093, 'epoch': 1} {'type': 'loss', 'content': 0.04587221145629883, 'timestamp': '2025-09-04 03:54:50.976639', 'step': 1094, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:54:51.094402', 'step': 1094, 'epoch': 1} {'type': 'loss', 'content': 0.028815526515245438, 'timestamp': '2025-09-04 03:54:51.116563', 'step': 1095, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:54:51.220135', 'step': 1095, 'epoch': 1} {'type': 'loss', 'content': 0.013644875027239323, 'timestamp': '2025-09-04 03:54:51.239889', 'step': 1096, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 03:54:51.384380', 'step': 1096, 'epoch': 1} {'type': 'loss', 'content': 0.03480622544884682, 'timestamp': '2025-09-04 03:54:51.415384', 'step': 1097, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:54:51.523257', 'step': 1097, 'epoch': 1} {'type': 'loss', 'content': 0.03265468776226044, 'timestamp': '2025-09-04 03:54:51.543382', 'step': 1098, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:54:51.638835', 'step': 1098, 'epoch': 1} {'type': 'loss', 'content': 0.003582942998036742, 'timestamp': '2025-09-04 03:54:51.656106', 'step': 1099, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:54:51.749819', 'step': 1099, 'epoch': 1} {'type': 'loss', 'content': 0.058865927159786224, 'timestamp': '2025-09-04 03:54:51.767754', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:00.169117', 'step': 1100, 'epoch': 1} {'type': 'pplx', 'content': 330.568579902323, 'timestamp': '2025-09-04 03:55:00.171543', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:00.268289', 'step': 1100, 'epoch': 1} {'type': 'loss', 'content': 0.06865076720714569, 'timestamp': '2025-09-04 03:55:00.288903', 'step': 1101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:00.390269', 'step': 1101, 'epoch': 1} {'type': 'loss', 'content': 0.10805238038301468, 'timestamp': '2025-09-04 03:55:00.408882', 'step': 1102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:55:00.484730', 'step': 1102, 'epoch': 1} {'type': 'loss', 'content': 0.01404650043696165, 'timestamp': '2025-09-04 03:55:00.498048', 'step': 1103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:55:00.590116', 'step': 1103, 'epoch': 1} {'type': 'loss', 'content': 0.06355747580528259, 'timestamp': '2025-09-04 03:55:00.607459', 'step': 1104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:55:00.714230', 'step': 1104, 'epoch': 1} {'type': 'loss', 'content': 0.02257397770881653, 'timestamp': '2025-09-04 03:55:00.736553', 'step': 1105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:00.835157', 'step': 1105, 'epoch': 1} {'type': 'loss', 'content': 0.03384973481297493, 'timestamp': '2025-09-04 03:55:00.852475', 'step': 1106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:00.956418', 'step': 1106, 'epoch': 1} {'type': 'loss', 'content': 0.001299434108659625, 'timestamp': '2025-09-04 03:55:00.975594', 'step': 1107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:55:01.051270', 'step': 1107, 'epoch': 1} {'type': 'loss', 'content': 0.02580624632537365, 'timestamp': '2025-09-04 03:55:01.065411', 'step': 1108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:55:01.157686', 'step': 1108, 'epoch': 1} {'type': 'loss', 'content': 0.04326159134507179, 'timestamp': '2025-09-04 03:55:01.176243', 'step': 1109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:01.277634', 'step': 1109, 'epoch': 1} {'type': 'loss', 'content': 0.05120621249079704, 'timestamp': '2025-09-04 03:55:01.296313', 'step': 1110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:01.399016', 'step': 1110, 'epoch': 1} {'type': 'loss', 'content': 0.0068900627084076405, 'timestamp': '2025-09-04 03:55:01.417975', 'step': 1111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:01.528856', 'step': 1111, 'epoch': 1} {'type': 'loss', 'content': 0.018264418467879295, 'timestamp': '2025-09-04 03:55:01.550027', 'step': 1112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:55:01.630695', 'step': 1112, 'epoch': 1} {'type': 'loss', 'content': 0.022158373147249222, 'timestamp': '2025-09-04 03:55:01.647141', 'step': 1113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 03:55:01.818413', 'step': 1113, 'epoch': 1} {'type': 'loss', 'content': 0.027837276458740234, 'timestamp': '2025-09-04 03:55:01.853035', 'step': 1114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:01.955510', 'step': 1114, 'epoch': 1} {'type': 'loss', 'content': 0.044819965958595276, 'timestamp': '2025-09-04 03:55:01.974485', 'step': 1115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:02.076943', 'step': 1115, 'epoch': 1} {'type': 'loss', 'content': 0.040360935032367706, 'timestamp': '2025-09-04 03:55:02.096612', 'step': 1116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:02.197548', 'step': 1116, 'epoch': 1} {'type': 'loss', 'content': 0.030223630368709564, 'timestamp': '2025-09-04 03:55:02.218582', 'step': 1117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:55:02.293568', 'step': 1117, 'epoch': 1} {'type': 'loss', 'content': 0.04847509413957596, 'timestamp': '2025-09-04 03:55:02.306906', 'step': 1118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:02.401949', 'step': 1118, 'epoch': 1} {'type': 'loss', 'content': 0.0198849868029356, 'timestamp': '2025-09-04 03:55:02.419194', 'step': 1119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:02.504757', 'step': 1119, 'epoch': 1} {'type': 'loss', 'content': 0.0160811897367239, 'timestamp': '2025-09-04 03:55:02.520772', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:10.996486', 'step': 1120, 'epoch': 1} {'type': 'pplx', 'content': 333.6826089296669, 'timestamp': '2025-09-04 03:55:10.998830', 'step': 1120, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1120', 'timestamp': '2025-09-04 03:55:11.503767', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:55:11.577559', 'step': 1120, 'epoch': 1} {'type': 'loss', 'content': 0.047653671354055405, 'timestamp': '2025-09-04 03:55:11.592267', 'step': 1121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:55:11.688189', 'step': 1121, 'epoch': 1} {'type': 'loss', 'content': 0.02942013181746006, 'timestamp': '2025-09-04 03:55:11.705321', 'step': 1122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:11.811599', 'step': 1122, 'epoch': 1} {'type': 'loss', 'content': 0.04028499498963356, 'timestamp': '2025-09-04 03:55:11.830641', 'step': 1123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:11.919412', 'step': 1123, 'epoch': 1} {'type': 'loss', 'content': 0.029223607853055, 'timestamp': '2025-09-04 03:55:11.935618', 'step': 1124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1232], 'flops': 24640149647168.0}, 'timestamp': '2025-09-04 03:55:12.117352', 'step': 1124, 'epoch': 1} {'type': 'loss', 'content': 0.100711390376091, 'timestamp': '2025-09-04 03:55:12.154447', 'step': 1125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:12.241477', 'step': 1125, 'epoch': 1} {'type': 'loss', 'content': 0.009318535216152668, 'timestamp': '2025-09-04 03:55:12.256752', 'step': 1126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:12.361415', 'step': 1126, 'epoch': 1} {'type': 'loss', 'content': 0.049649372696876526, 'timestamp': '2025-09-04 03:55:12.380493', 'step': 1127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:12.491512', 'step': 1127, 'epoch': 1} {'type': 'loss', 'content': 0.015808911994099617, 'timestamp': '2025-09-04 03:55:12.510709', 'step': 1128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 03:55:12.646841', 'step': 1128, 'epoch': 1} {'type': 'loss', 'content': 0.010250390507280827, 'timestamp': '2025-09-04 03:55:12.675085', 'step': 1129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:12.763963', 'step': 1129, 'epoch': 1} {'type': 'loss', 'content': 0.0313245952129364, 'timestamp': '2025-09-04 03:55:12.779365', 'step': 1130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:55:12.893061', 'step': 1130, 'epoch': 1} {'type': 'loss', 'content': 0.10398827493190765, 'timestamp': '2025-09-04 03:55:12.912994', 'step': 1131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:13.017369', 'step': 1131, 'epoch': 1} {'type': 'loss', 'content': 0.03392321988940239, 'timestamp': '2025-09-04 03:55:13.036669', 'step': 1132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:55:13.126425', 'step': 1132, 'epoch': 1} {'type': 'loss', 'content': 0.008513440378010273, 'timestamp': '2025-09-04 03:55:13.144531', 'step': 1133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:13.247351', 'step': 1133, 'epoch': 1} {'type': 'loss', 'content': 0.0382850281894207, 'timestamp': '2025-09-04 03:55:13.265558', 'step': 1134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:55:13.344506', 'step': 1134, 'epoch': 1} {'type': 'loss', 'content': 0.07204016298055649, 'timestamp': '2025-09-04 03:55:13.358254', 'step': 1135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:13.444419', 'step': 1135, 'epoch': 1} {'type': 'loss', 'content': 0.014276178553700447, 'timestamp': '2025-09-04 03:55:13.460401', 'step': 1136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:13.561239', 'step': 1136, 'epoch': 1} {'type': 'loss', 'content': 0.0037956838496029377, 'timestamp': '2025-09-04 03:55:13.580737', 'step': 1137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:13.689193', 'step': 1137, 'epoch': 1} {'type': 'loss', 'content': 0.005481324158608913, 'timestamp': '2025-09-04 03:55:13.707893', 'step': 1138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:13.809507', 'step': 1138, 'epoch': 1} {'type': 'loss', 'content': 0.0013551759766414762, 'timestamp': '2025-09-04 03:55:13.827493', 'step': 1139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:55:13.967598', 'step': 1139, 'epoch': 1} {'type': 'loss', 'content': 0.0756765827536583, 'timestamp': '2025-09-04 03:55:13.994045', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:22.505779', 'step': 1140, 'epoch': 1} {'type': 'pplx', 'content': 342.1746491135325, 'timestamp': '2025-09-04 03:55:22.508181', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:55:22.625590', 'step': 1140, 'epoch': 1} {'type': 'loss', 'content': 0.01116140466183424, 'timestamp': '2025-09-04 03:55:22.651011', 'step': 1141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:55:22.855245', 'step': 1141, 'epoch': 1} {'type': 'loss', 'content': 0.031512316316366196, 'timestamp': '2025-09-04 03:55:22.894487', 'step': 1142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:23.005275', 'step': 1142, 'epoch': 1} {'type': 'loss', 'content': 0.010817242786288261, 'timestamp': '2025-09-04 03:55:23.025885', 'step': 1143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:23.122786', 'step': 1143, 'epoch': 1} {'type': 'loss', 'content': 0.011584990657866001, 'timestamp': '2025-09-04 03:55:23.141003', 'step': 1144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:55:23.232853', 'step': 1144, 'epoch': 1} {'type': 'loss', 'content': 0.01660754159092903, 'timestamp': '2025-09-04 03:55:23.251496', 'step': 1145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:55:23.362256', 'step': 1145, 'epoch': 1} {'type': 'loss', 'content': 0.042043667286634445, 'timestamp': '2025-09-04 03:55:23.382433', 'step': 1146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:23.491895', 'step': 1146, 'epoch': 1} {'type': 'loss', 'content': 0.04430728405714035, 'timestamp': '2025-09-04 03:55:23.512635', 'step': 1147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:23.616816', 'step': 1147, 'epoch': 1} {'type': 'loss', 'content': 0.04766961559653282, 'timestamp': '2025-09-04 03:55:23.636716', 'step': 1148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:23.729421', 'step': 1148, 'epoch': 1} {'type': 'loss', 'content': 0.02647322788834572, 'timestamp': '2025-09-04 03:55:23.748507', 'step': 1149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:23.833529', 'step': 1149, 'epoch': 1} {'type': 'loss', 'content': 0.008523870259523392, 'timestamp': '2025-09-04 03:55:23.849020', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:55:23.939069', 'step': 1150, 'epoch': 1} {'type': 'loss', 'content': 0.03426166996359825, 'timestamp': '2025-09-04 03:55:23.955715', 'step': 1151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:24.040437', 'step': 1151, 'epoch': 1} {'type': 'loss', 'content': 0.0056257485412061214, 'timestamp': '2025-09-04 03:55:24.056529', 'step': 1152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:55:24.148754', 'step': 1152, 'epoch': 1} {'type': 'loss', 'content': 0.03579988703131676, 'timestamp': '2025-09-04 03:55:24.167610', 'step': 1153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:24.270568', 'step': 1153, 'epoch': 1} {'type': 'loss', 'content': 0.021155666559934616, 'timestamp': '2025-09-04 03:55:24.289678', 'step': 1154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:24.400847', 'step': 1154, 'epoch': 1} {'type': 'loss', 'content': 0.020209012553095818, 'timestamp': '2025-09-04 03:55:24.421226', 'step': 1155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:24.530289', 'step': 1155, 'epoch': 1} {'type': 'loss', 'content': 0.08458837121725082, 'timestamp': '2025-09-04 03:55:24.550087', 'step': 1156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:24.632987', 'step': 1156, 'epoch': 1} {'type': 'loss', 'content': 0.04702272638678551, 'timestamp': '2025-09-04 03:55:24.649804', 'step': 1157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:55:24.772244', 'step': 1157, 'epoch': 1} {'type': 'loss', 'content': 0.03629223629832268, 'timestamp': '2025-09-04 03:55:24.795123', 'step': 1158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:24.906181', 'step': 1158, 'epoch': 1} {'type': 'loss', 'content': 0.010823562741279602, 'timestamp': '2025-09-04 03:55:24.926963', 'step': 1159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:25.026738', 'step': 1159, 'epoch': 1} {'type': 'loss', 'content': 0.015217112377285957, 'timestamp': '2025-09-04 03:55:25.045911', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:33.427667', 'step': 1160, 'epoch': 1} {'type': 'pplx', 'content': 348.55127656576644, 'timestamp': '2025-09-04 03:55:33.429586', 'step': 1160, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1160', 'timestamp': '2025-09-04 03:55:33.897323', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:55:34.003968', 'step': 1160, 'epoch': 1} {'type': 'loss', 'content': 0.05122144892811775, 'timestamp': '2025-09-04 03:55:34.026479', 'step': 1161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:34.128682', 'step': 1161, 'epoch': 1} {'type': 'loss', 'content': 0.02695164829492569, 'timestamp': '2025-09-04 03:55:34.147589', 'step': 1162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:55:34.226537', 'step': 1162, 'epoch': 1} {'type': 'loss', 'content': 0.07939313352108002, 'timestamp': '2025-09-04 03:55:34.240538', 'step': 1163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:34.347710', 'step': 1163, 'epoch': 1} {'type': 'loss', 'content': 0.032340407371520996, 'timestamp': '2025-09-04 03:55:34.367546', 'step': 1164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:34.468983', 'step': 1164, 'epoch': 1} {'type': 'loss', 'content': 0.04987955838441849, 'timestamp': '2025-09-04 03:55:34.489115', 'step': 1165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:55:34.582559', 'step': 1165, 'epoch': 1} {'type': 'loss', 'content': 0.005085882265120745, 'timestamp': '2025-09-04 03:55:34.599866', 'step': 1166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 03:55:34.745576', 'step': 1166, 'epoch': 1} {'type': 'loss', 'content': 0.028625115752220154, 'timestamp': '2025-09-04 03:55:34.773627', 'step': 1167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 03:55:34.844590', 'step': 1167, 'epoch': 1} {'type': 'loss', 'content': 0.04103225842118263, 'timestamp': '2025-09-04 03:55:34.857925', 'step': 1168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:34.960699', 'step': 1168, 'epoch': 1} {'type': 'loss', 'content': 0.029832925647497177, 'timestamp': '2025-09-04 03:55:34.981572', 'step': 1169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:55:35.091917', 'step': 1169, 'epoch': 1} {'type': 'loss', 'content': 0.006762138567864895, 'timestamp': '2025-09-04 03:55:35.112008', 'step': 1170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 03:55:35.274883', 'step': 1170, 'epoch': 1} {'type': 'loss', 'content': 0.04091715067625046, 'timestamp': '2025-09-04 03:55:35.306668', 'step': 1171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:55:35.390858', 'step': 1171, 'epoch': 1} {'type': 'loss', 'content': 0.07869014889001846, 'timestamp': '2025-09-04 03:55:35.406503', 'step': 1172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:35.507835', 'step': 1172, 'epoch': 1} {'type': 'loss', 'content': 0.07957983762025833, 'timestamp': '2025-09-04 03:55:35.528848', 'step': 1173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 03:55:35.598923', 'step': 1173, 'epoch': 1} {'type': 'loss', 'content': 0.00997950043529272, 'timestamp': '2025-09-04 03:55:35.611323', 'step': 1174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:35.722162', 'step': 1174, 'epoch': 1} {'type': 'loss', 'content': 0.04756058380007744, 'timestamp': '2025-09-04 03:55:35.742478', 'step': 1175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:35.829004', 'step': 1175, 'epoch': 1} {'type': 'loss', 'content': 0.046221889555454254, 'timestamp': '2025-09-04 03:55:35.845178', 'step': 1176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:35.946113', 'step': 1176, 'epoch': 1} {'type': 'loss', 'content': 0.030887359753251076, 'timestamp': '2025-09-04 03:55:35.966861', 'step': 1177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:36.067236', 'step': 1177, 'epoch': 1} {'type': 'loss', 'content': 0.03892279416322708, 'timestamp': '2025-09-04 03:55:36.085561', 'step': 1178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:55:36.164518', 'step': 1178, 'epoch': 1} {'type': 'loss', 'content': 0.007792294956743717, 'timestamp': '2025-09-04 03:55:36.178652', 'step': 1179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:55:36.288591', 'step': 1179, 'epoch': 1} {'type': 'loss', 'content': 0.05254804715514183, 'timestamp': '2025-09-04 03:55:36.309426', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:44.707890', 'step': 1180, 'epoch': 1} {'type': 'pplx', 'content': 353.37533564087937, 'timestamp': '2025-09-04 03:55:44.709798', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:44.791751', 'step': 1180, 'epoch': 1} {'type': 'loss', 'content': 0.028075462207198143, 'timestamp': '2025-09-04 03:55:44.808892', 'step': 1181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:44.904297', 'step': 1181, 'epoch': 1} {'type': 'loss', 'content': 0.062234699726104736, 'timestamp': '2025-09-04 03:55:44.921666', 'step': 1182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:55:45.029947', 'step': 1182, 'epoch': 1} {'type': 'loss', 'content': 0.05085289850831032, 'timestamp': '2025-09-04 03:55:45.050048', 'step': 1183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:55:45.126108', 'step': 1183, 'epoch': 1} {'type': 'loss', 'content': 0.03087581880390644, 'timestamp': '2025-09-04 03:55:45.140649', 'step': 1184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:55:45.243704', 'step': 1184, 'epoch': 1} {'type': 'loss', 'content': 0.008511030115187168, 'timestamp': '2025-09-04 03:55:45.265511', 'step': 1185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:45.377248', 'step': 1185, 'epoch': 1} {'type': 'loss', 'content': 0.06436464190483093, 'timestamp': '2025-09-04 03:55:45.397541', 'step': 1186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:55:45.493257', 'step': 1186, 'epoch': 1} {'type': 'loss', 'content': 0.01800878904759884, 'timestamp': '2025-09-04 03:55:45.510182', 'step': 1187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:45.611829', 'step': 1187, 'epoch': 1} {'type': 'loss', 'content': 0.004938524682074785, 'timestamp': '2025-09-04 03:55:45.631169', 'step': 1188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1088], 'flops': 21760132163840.0}, 'timestamp': '2025-09-04 03:55:45.785314', 'step': 1188, 'epoch': 1} {'type': 'loss', 'content': 0.03547127917408943, 'timestamp': '2025-09-04 03:55:45.818526', 'step': 1189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:45.923937', 'step': 1189, 'epoch': 1} {'type': 'loss', 'content': 0.04392065852880478, 'timestamp': '2025-09-04 03:55:45.943073', 'step': 1190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:55:46.044602', 'step': 1190, 'epoch': 1} {'type': 'loss', 'content': 0.005546086002141237, 'timestamp': '2025-09-04 03:55:46.061763', 'step': 1191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:55:46.173039', 'step': 1191, 'epoch': 1} {'type': 'loss', 'content': 0.019072137773036957, 'timestamp': '2025-09-04 03:55:46.193789', 'step': 1192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:55:46.285071', 'step': 1192, 'epoch': 1} {'type': 'loss', 'content': 0.018132086843252182, 'timestamp': '2025-09-04 03:55:46.301809', 'step': 1193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:46.392004', 'step': 1193, 'epoch': 1} {'type': 'loss', 'content': 0.04924427345395088, 'timestamp': '2025-09-04 03:55:46.407467', 'step': 1194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:46.510364', 'step': 1194, 'epoch': 1} {'type': 'loss', 'content': 0.03797117620706558, 'timestamp': '2025-09-04 03:55:46.528840', 'step': 1195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:55:46.627211', 'step': 1195, 'epoch': 1} {'type': 'loss', 'content': 0.040146905928850174, 'timestamp': '2025-09-04 03:55:46.644773', 'step': 1196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:46.746916', 'step': 1196, 'epoch': 1} {'type': 'loss', 'content': 0.03447960317134857, 'timestamp': '2025-09-04 03:55:46.767915', 'step': 1197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:55:46.890588', 'step': 1197, 'epoch': 1} {'type': 'loss', 'content': 0.019626695662736893, 'timestamp': '2025-09-04 03:55:46.911658', 'step': 1198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:55:47.004148', 'step': 1198, 'epoch': 1} {'type': 'loss', 'content': 0.00873672403395176, 'timestamp': '2025-09-04 03:55:47.020696', 'step': 1199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:55:47.128916', 'step': 1199, 'epoch': 1} {'type': 'loss', 'content': 0.06406814604997635, 'timestamp': '2025-09-04 03:55:47.146364', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:55:55.666983', 'step': 1200, 'epoch': 1} {'type': 'pplx', 'content': 351.81034139828853, 'timestamp': '2025-09-04 03:55:55.668968', 'step': 1200, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1200', 'timestamp': '2025-09-04 03:55:56.192955', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 03:55:56.257064', 'step': 1200, 'epoch': 1} {'type': 'loss', 'content': 0.00952016282826662, 'timestamp': '2025-09-04 03:55:56.268026', 'step': 1201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:55:56.351897', 'step': 1201, 'epoch': 1} {'type': 'loss', 'content': 0.014243190176784992, 'timestamp': '2025-09-04 03:55:56.366624', 'step': 1202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 03:55:56.497096', 'step': 1202, 'epoch': 1} {'type': 'loss', 'content': 0.05320248380303383, 'timestamp': '2025-09-04 03:55:56.520822', 'step': 1203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:56.610589', 'step': 1203, 'epoch': 1} {'type': 'loss', 'content': 0.05997435748577118, 'timestamp': '2025-09-04 03:55:56.626370', 'step': 1204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:56.741582', 'step': 1204, 'epoch': 1} {'type': 'loss', 'content': 0.037900906056165695, 'timestamp': '2025-09-04 03:55:56.763717', 'step': 1205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:55:56.874415', 'step': 1205, 'epoch': 1} {'type': 'loss', 'content': 0.06352101266384125, 'timestamp': '2025-09-04 03:55:56.894641', 'step': 1206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:57.001142', 'step': 1206, 'epoch': 1} {'type': 'loss', 'content': 0.04056302830576897, 'timestamp': '2025-09-04 03:55:57.020228', 'step': 1207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:57.123990', 'step': 1207, 'epoch': 1} {'type': 'loss', 'content': 0.02337026037275791, 'timestamp': '2025-09-04 03:55:57.143031', 'step': 1208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:55:57.222919', 'step': 1208, 'epoch': 1} {'type': 'loss', 'content': 0.022142881527543068, 'timestamp': '2025-09-04 03:55:57.237961', 'step': 1209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:55:57.353958', 'step': 1209, 'epoch': 1} {'type': 'loss', 'content': 0.008503721095621586, 'timestamp': '2025-09-04 03:55:57.374604', 'step': 1210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:55:57.481228', 'step': 1210, 'epoch': 1} {'type': 'loss', 'content': 0.024838682264089584, 'timestamp': '2025-09-04 03:55:57.500077', 'step': 1211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:57.607060', 'step': 1211, 'epoch': 1} {'type': 'loss', 'content': 0.016720108687877655, 'timestamp': '2025-09-04 03:55:57.626884', 'step': 1212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:55:57.716274', 'step': 1212, 'epoch': 1} {'type': 'loss', 'content': 0.03313479945063591, 'timestamp': '2025-09-04 03:55:57.734459', 'step': 1213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:57.839337', 'step': 1213, 'epoch': 1} {'type': 'loss', 'content': 0.03852255269885063, 'timestamp': '2025-09-04 03:55:57.858178', 'step': 1214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:57.959198', 'step': 1214, 'epoch': 1} {'type': 'loss', 'content': 0.030346812680363655, 'timestamp': '2025-09-04 03:55:57.978050', 'step': 1215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:55:58.080948', 'step': 1215, 'epoch': 1} {'type': 'loss', 'content': 0.013426399789750576, 'timestamp': '2025-09-04 03:55:58.100720', 'step': 1216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:55:58.219568', 'step': 1216, 'epoch': 1} {'type': 'loss', 'content': 0.007183226756751537, 'timestamp': '2025-09-04 03:55:58.244859', 'step': 1217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:55:58.331516', 'step': 1217, 'epoch': 1} {'type': 'loss', 'content': 0.024007325991988182, 'timestamp': '2025-09-04 03:55:58.346913', 'step': 1218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:55:58.446232', 'step': 1218, 'epoch': 1} {'type': 'loss', 'content': 0.01614953577518463, 'timestamp': '2025-09-04 03:55:58.464742', 'step': 1219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:55:58.566439', 'step': 1219, 'epoch': 1} {'type': 'loss', 'content': 0.05485903471708298, 'timestamp': '2025-09-04 03:55:58.585891', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:56:07.237314', 'step': 1220, 'epoch': 1} {'type': 'pplx', 'content': 343.1063356120184, 'timestamp': '2025-09-04 03:56:07.239254', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:07.337811', 'step': 1220, 'epoch': 1} {'type': 'loss', 'content': 0.009709849022328854, 'timestamp': '2025-09-04 03:56:07.358941', 'step': 1221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:07.453863', 'step': 1221, 'epoch': 1} {'type': 'loss', 'content': 0.0713476687669754, 'timestamp': '2025-09-04 03:56:07.471290', 'step': 1222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:07.574956', 'step': 1222, 'epoch': 1} {'type': 'loss', 'content': 0.014135504141449928, 'timestamp': '2025-09-04 03:56:07.594208', 'step': 1223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:07.694439', 'step': 1223, 'epoch': 1} {'type': 'loss', 'content': 0.08039960265159607, 'timestamp': '2025-09-04 03:56:07.713550', 'step': 1224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:07.811723', 'step': 1224, 'epoch': 1} {'type': 'loss', 'content': 0.006991466507315636, 'timestamp': '2025-09-04 03:56:07.832184', 'step': 1225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:07.935653', 'step': 1225, 'epoch': 1} {'type': 'loss', 'content': 0.003120355773717165, 'timestamp': '2025-09-04 03:56:07.954778', 'step': 1226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:08.063069', 'step': 1226, 'epoch': 1} {'type': 'loss', 'content': 0.03354727104306221, 'timestamp': '2025-09-04 03:56:08.083308', 'step': 1227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:56:08.199707', 'step': 1227, 'epoch': 1} {'type': 'loss', 'content': 0.004579135682433844, 'timestamp': '2025-09-04 03:56:08.222359', 'step': 1228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:08.315258', 'step': 1228, 'epoch': 1} {'type': 'loss', 'content': 0.010765165090560913, 'timestamp': '2025-09-04 03:56:08.334373', 'step': 1229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:56:08.413365', 'step': 1229, 'epoch': 1} {'type': 'loss', 'content': 0.03472265228629112, 'timestamp': '2025-09-04 03:56:08.427468', 'step': 1230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:56:08.532931', 'step': 1230, 'epoch': 1} {'type': 'loss', 'content': 0.028265012428164482, 'timestamp': '2025-09-04 03:56:08.552694', 'step': 1231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:08.668961', 'step': 1231, 'epoch': 1} {'type': 'loss', 'content': 0.06229390949010849, 'timestamp': '2025-09-04 03:56:08.688691', 'step': 1232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:56:08.771497', 'step': 1232, 'epoch': 1} {'type': 'loss', 'content': 0.013525002636015415, 'timestamp': '2025-09-04 03:56:08.787962', 'step': 1233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:56:08.905323', 'step': 1233, 'epoch': 1} {'type': 'loss', 'content': 0.011920424178242683, 'timestamp': '2025-09-04 03:56:08.927293', 'step': 1234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:56:09.003578', 'step': 1234, 'epoch': 1} {'type': 'loss', 'content': 0.01954522728919983, 'timestamp': '2025-09-04 03:56:09.017136', 'step': 1235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:56:09.124546', 'step': 1235, 'epoch': 1} {'type': 'loss', 'content': 0.002615840407088399, 'timestamp': '2025-09-04 03:56:09.145091', 'step': 1236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:56:09.253716', 'step': 1236, 'epoch': 1} {'type': 'loss', 'content': 0.051942598074674606, 'timestamp': '2025-09-04 03:56:09.276397', 'step': 1237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:09.370402', 'step': 1237, 'epoch': 1} {'type': 'loss', 'content': 0.006919113453477621, 'timestamp': '2025-09-04 03:56:09.387324', 'step': 1238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:56:09.472227', 'step': 1238, 'epoch': 1} {'type': 'loss', 'content': 0.11789951473474503, 'timestamp': '2025-09-04 03:56:09.487161', 'step': 1239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:09.580412', 'step': 1239, 'epoch': 1} {'type': 'loss', 'content': 0.031079446896910667, 'timestamp': '2025-09-04 03:56:09.598105', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:56:17.965019', 'step': 1240, 'epoch': 1} {'type': 'pplx', 'content': 339.34815384838885, 'timestamp': '2025-09-04 03:56:17.966998', 'step': 1240, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1240', 'timestamp': '2025-09-04 03:56:18.478463', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:56:18.583380', 'step': 1240, 'epoch': 1} {'type': 'loss', 'content': 0.046426158398389816, 'timestamp': '2025-09-04 03:56:18.605824', 'step': 1241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:56:18.716749', 'step': 1241, 'epoch': 1} {'type': 'loss', 'content': 0.009461128152906895, 'timestamp': '2025-09-04 03:56:18.737124', 'step': 1242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:18.836445', 'step': 1242, 'epoch': 1} {'type': 'loss', 'content': 0.02478550747036934, 'timestamp': '2025-09-04 03:56:18.853794', 'step': 1243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 03:56:18.924573', 'step': 1243, 'epoch': 1} {'type': 'loss', 'content': 0.021543040871620178, 'timestamp': '2025-09-04 03:56:18.937862', 'step': 1244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:56:19.022085', 'step': 1244, 'epoch': 1} {'type': 'loss', 'content': 0.0028493471909314394, 'timestamp': '2025-09-04 03:56:19.038702', 'step': 1245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:19.147377', 'step': 1245, 'epoch': 1} {'type': 'loss', 'content': 0.03405190631747246, 'timestamp': '2025-09-04 03:56:19.165832', 'step': 1246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:19.266960', 'step': 1246, 'epoch': 1} {'type': 'loss', 'content': 0.04478135332465172, 'timestamp': '2025-09-04 03:56:19.285942', 'step': 1247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:19.388388', 'step': 1247, 'epoch': 1} {'type': 'loss', 'content': 0.06943120062351227, 'timestamp': '2025-09-04 03:56:19.408022', 'step': 1248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:19.508182', 'step': 1248, 'epoch': 1} {'type': 'loss', 'content': 0.01673070713877678, 'timestamp': '2025-09-04 03:56:19.528349', 'step': 1249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:56:19.635364', 'step': 1249, 'epoch': 1} {'type': 'loss', 'content': 0.04285313934087753, 'timestamp': '2025-09-04 03:56:19.655112', 'step': 1250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:19.758825', 'step': 1250, 'epoch': 1} {'type': 'loss', 'content': 0.007856685668230057, 'timestamp': '2025-09-04 03:56:19.777910', 'step': 1251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:56:19.852320', 'step': 1251, 'epoch': 1} {'type': 'loss', 'content': 0.033521927893161774, 'timestamp': '2025-09-04 03:56:19.866604', 'step': 1252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:19.958870', 'step': 1252, 'epoch': 1} {'type': 'loss', 'content': 0.05988616868853569, 'timestamp': '2025-09-04 03:56:19.977743', 'step': 1253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:56:20.083349', 'step': 1253, 'epoch': 1} {'type': 'loss', 'content': 0.05120409280061722, 'timestamp': '2025-09-04 03:56:20.103094', 'step': 1254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:56:20.188253', 'step': 1254, 'epoch': 1} {'type': 'loss', 'content': 0.03182673081755638, 'timestamp': '2025-09-04 03:56:20.203436', 'step': 1255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:20.296178', 'step': 1255, 'epoch': 1} {'type': 'loss', 'content': 0.03344005346298218, 'timestamp': '2025-09-04 03:56:20.313868', 'step': 1256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:56:20.402001', 'step': 1256, 'epoch': 1} {'type': 'loss', 'content': 0.0048216646537184715, 'timestamp': '2025-09-04 03:56:20.420176', 'step': 1257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:56:20.501654', 'step': 1257, 'epoch': 1} {'type': 'loss', 'content': 0.04110339656472206, 'timestamp': '2025-09-04 03:56:20.515279', 'step': 1258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:20.616816', 'step': 1258, 'epoch': 1} {'type': 'loss', 'content': 0.0046073743142187595, 'timestamp': '2025-09-04 03:56:20.635877', 'step': 1259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:20.738003', 'step': 1259, 'epoch': 1} {'type': 'loss', 'content': 0.009192178025841713, 'timestamp': '2025-09-04 03:56:20.757753', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:56:29.142086', 'step': 1260, 'epoch': 1} {'type': 'pplx', 'content': 340.13528680803216, 'timestamp': '2025-09-04 03:56:29.144161', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:56:29.223599', 'step': 1260, 'epoch': 1} {'type': 'loss', 'content': 0.03510812297463417, 'timestamp': '2025-09-04 03:56:29.240057', 'step': 1261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:29.348362', 'step': 1261, 'epoch': 1} {'type': 'loss', 'content': 0.04541385546326637, 'timestamp': '2025-09-04 03:56:29.368470', 'step': 1262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:29.470427', 'step': 1262, 'epoch': 1} {'type': 'loss', 'content': 0.021681929007172585, 'timestamp': '2025-09-04 03:56:29.489312', 'step': 1263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:29.582465', 'step': 1263, 'epoch': 1} {'type': 'loss', 'content': 0.014316936954855919, 'timestamp': '2025-09-04 03:56:29.600152', 'step': 1264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:29.704522', 'step': 1264, 'epoch': 1} {'type': 'loss', 'content': 0.023813286796212196, 'timestamp': '2025-09-04 03:56:29.726481', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:56:29.837560', 'step': 1265, 'epoch': 1} {'type': 'loss', 'content': 0.06364905834197998, 'timestamp': '2025-09-04 03:56:29.857947', 'step': 1266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 03:56:30.003616', 'step': 1266, 'epoch': 1} {'type': 'loss', 'content': 0.02237660065293312, 'timestamp': '2025-09-04 03:56:30.031688', 'step': 1267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:30.132118', 'step': 1267, 'epoch': 1} {'type': 'loss', 'content': 0.050466664135456085, 'timestamp': '2025-09-04 03:56:30.151541', 'step': 1268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:30.241954', 'step': 1268, 'epoch': 1} {'type': 'loss', 'content': 0.01618255488574505, 'timestamp': '2025-09-04 03:56:30.260597', 'step': 1269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:56:30.344420', 'step': 1269, 'epoch': 1} {'type': 'loss', 'content': 0.16435541212558746, 'timestamp': '2025-09-04 03:56:30.359316', 'step': 1270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:30.461700', 'step': 1270, 'epoch': 1} {'type': 'loss', 'content': 0.03205430880188942, 'timestamp': '2025-09-04 03:56:30.480832', 'step': 1271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:56:30.558405', 'step': 1271, 'epoch': 1} {'type': 'loss', 'content': 0.06162261217832565, 'timestamp': '2025-09-04 03:56:30.572763', 'step': 1272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:56:30.656850', 'step': 1272, 'epoch': 1} {'type': 'loss', 'content': 0.023222552612423897, 'timestamp': '2025-09-04 03:56:30.673770', 'step': 1273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:30.781833', 'step': 1273, 'epoch': 1} {'type': 'loss', 'content': 0.06229608505964279, 'timestamp': '2025-09-04 03:56:30.801926', 'step': 1274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:56:30.910768', 'step': 1274, 'epoch': 1} {'type': 'loss', 'content': 0.0057587032206356525, 'timestamp': '2025-09-04 03:56:30.931221', 'step': 1275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:31.022716', 'step': 1275, 'epoch': 1} {'type': 'loss', 'content': 0.026004638522863388, 'timestamp': '2025-09-04 03:56:31.040502', 'step': 1276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:31.130737', 'step': 1276, 'epoch': 1} {'type': 'loss', 'content': 0.03231319040060043, 'timestamp': '2025-09-04 03:56:31.149923', 'step': 1277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:31.251356', 'step': 1277, 'epoch': 1} {'type': 'loss', 'content': 0.02814597263932228, 'timestamp': '2025-09-04 03:56:31.270274', 'step': 1278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:56:31.344681', 'step': 1278, 'epoch': 1} {'type': 'loss', 'content': 0.04910106584429741, 'timestamp': '2025-09-04 03:56:31.358005', 'step': 1279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:56:31.450944', 'step': 1279, 'epoch': 1} {'type': 'loss', 'content': 0.05825873091816902, 'timestamp': '2025-09-04 03:56:31.467043', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:56:39.844352', 'step': 1280, 'epoch': 1} {'type': 'pplx', 'content': 340.68138853975984, 'timestamp': '2025-09-04 03:56:39.846112', 'step': 1280, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1280', 'timestamp': '2025-09-04 03:56:40.198819', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:40.296797', 'step': 1280, 'epoch': 1} {'type': 'loss', 'content': 0.010946203954517841, 'timestamp': '2025-09-04 03:56:40.317552', 'step': 1281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:56:40.402157', 'step': 1281, 'epoch': 1} {'type': 'loss', 'content': 0.01738363690674305, 'timestamp': '2025-09-04 03:56:40.417412', 'step': 1282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:40.511099', 'step': 1282, 'epoch': 1} {'type': 'loss', 'content': 0.026985451579093933, 'timestamp': '2025-09-04 03:56:40.528410', 'step': 1283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:40.621743', 'step': 1283, 'epoch': 1} {'type': 'loss', 'content': 0.005689225625246763, 'timestamp': '2025-09-04 03:56:40.639547', 'step': 1284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:40.730975', 'step': 1284, 'epoch': 1} {'type': 'loss', 'content': 0.03335564583539963, 'timestamp': '2025-09-04 03:56:40.749866', 'step': 1285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:40.850254', 'step': 1285, 'epoch': 1} {'type': 'loss', 'content': 0.014564265497028828, 'timestamp': '2025-09-04 03:56:40.868661', 'step': 1286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:40.967605', 'step': 1286, 'epoch': 1} {'type': 'loss', 'content': 0.04401400312781334, 'timestamp': '2025-09-04 03:56:40.985931', 'step': 1287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:41.081033', 'step': 1287, 'epoch': 1} {'type': 'loss', 'content': 0.031287554651498795, 'timestamp': '2025-09-04 03:56:41.099129', 'step': 1288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:56:41.208052', 'step': 1288, 'epoch': 1} {'type': 'loss', 'content': 0.01182449609041214, 'timestamp': '2025-09-04 03:56:41.230594', 'step': 1289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:56:41.314338', 'step': 1289, 'epoch': 1} {'type': 'loss', 'content': 0.08080806583166122, 'timestamp': '2025-09-04 03:56:41.329180', 'step': 1290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:41.433827', 'step': 1290, 'epoch': 1} {'type': 'loss', 'content': 0.005769069772213697, 'timestamp': '2025-09-04 03:56:41.452928', 'step': 1291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:41.556488', 'step': 1291, 'epoch': 1} {'type': 'loss', 'content': 0.007577816490083933, 'timestamp': '2025-09-04 03:56:41.576092', 'step': 1292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:41.667203', 'step': 1292, 'epoch': 1} {'type': 'loss', 'content': 0.029849909245967865, 'timestamp': '2025-09-04 03:56:41.685829', 'step': 1293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:56:41.795843', 'step': 1293, 'epoch': 1} {'type': 'loss', 'content': 0.04431498050689697, 'timestamp': '2025-09-04 03:56:41.816259', 'step': 1294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:41.919388', 'step': 1294, 'epoch': 1} {'type': 'loss', 'content': 0.01207160297781229, 'timestamp': '2025-09-04 03:56:41.938071', 'step': 1295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 03:56:42.072302', 'step': 1295, 'epoch': 1} {'type': 'loss', 'content': 0.007530734874308109, 'timestamp': '2025-09-04 03:56:42.097544', 'step': 1296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:42.187551', 'step': 1296, 'epoch': 1} {'type': 'loss', 'content': 0.030552592128515244, 'timestamp': '2025-09-04 03:56:42.206373', 'step': 1297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:56:42.283972', 'step': 1297, 'epoch': 1} {'type': 'loss', 'content': 0.026529986411333084, 'timestamp': '2025-09-04 03:56:42.297966', 'step': 1298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:56:42.390767', 'step': 1298, 'epoch': 1} {'type': 'loss', 'content': 0.06174427270889282, 'timestamp': '2025-09-04 03:56:42.407715', 'step': 1299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:56:42.524343', 'step': 1299, 'epoch': 1} {'type': 'loss', 'content': 0.017831875011324883, 'timestamp': '2025-09-04 03:56:42.545459', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:56:50.958266', 'step': 1300, 'epoch': 1} {'type': 'pplx', 'content': 344.45454539575513, 'timestamp': '2025-09-04 03:56:50.960462', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:56:51.033210', 'step': 1300, 'epoch': 2} {'type': 'loss', 'content': 0.00863197073340416, 'timestamp': '2025-09-04 03:56:51.048203', 'step': 1301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:51.152787', 'step': 1301, 'epoch': 2} {'type': 'loss', 'content': 0.03207479417324066, 'timestamp': '2025-09-04 03:56:51.171776', 'step': 1302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:56:51.272855', 'step': 1302, 'epoch': 2} {'type': 'loss', 'content': 0.03501874580979347, 'timestamp': '2025-09-04 03:56:51.291496', 'step': 1303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:56:51.395438', 'step': 1303, 'epoch': 2} {'type': 'loss', 'content': 0.040076903998851776, 'timestamp': '2025-09-04 03:56:51.415398', 'step': 1304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:51.507169', 'step': 1304, 'epoch': 2} {'type': 'loss', 'content': 0.06259066611528397, 'timestamp': '2025-09-04 03:56:51.526093', 'step': 1305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:51.621169', 'step': 1305, 'epoch': 2} {'type': 'loss', 'content': 0.005843855440616608, 'timestamp': '2025-09-04 03:56:51.638364', 'step': 1306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:51.746720', 'step': 1306, 'epoch': 2} {'type': 'loss', 'content': 0.016627954319119453, 'timestamp': '2025-09-04 03:56:51.766813', 'step': 1307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:56:51.870051', 'step': 1307, 'epoch': 2} {'type': 'loss', 'content': 0.01708150841295719, 'timestamp': '2025-09-04 03:56:51.889764', 'step': 1308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:56:51.993489', 'step': 1308, 'epoch': 2} {'type': 'loss', 'content': 0.02616330236196518, 'timestamp': '2025-09-04 03:56:52.015383', 'step': 1309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:56:52.110091', 'step': 1309, 'epoch': 2} {'type': 'loss', 'content': 0.0031908308155834675, 'timestamp': '2025-09-04 03:56:52.127233', 'step': 1310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:56:52.199864', 'step': 1310, 'epoch': 2} {'type': 'loss', 'content': 0.03640008345246315, 'timestamp': '2025-09-04 03:56:52.212599', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:52.308612', 'step': 1311, 'epoch': 2} {'type': 'loss', 'content': 0.009561690501868725, 'timestamp': '2025-09-04 03:56:52.326635', 'step': 1312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:56:52.424790', 'step': 1312, 'epoch': 2} {'type': 'loss', 'content': 0.04150993376970291, 'timestamp': '2025-09-04 03:56:52.445011', 'step': 1313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:56:52.536780', 'step': 1313, 'epoch': 2} {'type': 'loss', 'content': 0.019684717059135437, 'timestamp': '2025-09-04 03:56:52.552190', 'step': 1314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:56:52.661011', 'step': 1314, 'epoch': 2} {'type': 'loss', 'content': 0.011558826081454754, 'timestamp': '2025-09-04 03:56:52.681084', 'step': 1315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:56:52.767800', 'step': 1315, 'epoch': 2} {'type': 'loss', 'content': 0.01043105311691761, 'timestamp': '2025-09-04 03:56:52.783965', 'step': 1316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:56:52.877263', 'step': 1316, 'epoch': 2} {'type': 'loss', 'content': 0.02803068794310093, 'timestamp': '2025-09-04 03:56:52.896318', 'step': 1317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:56:52.973511', 'step': 1317, 'epoch': 2} {'type': 'loss', 'content': 0.09199260920286179, 'timestamp': '2025-09-04 03:56:52.987265', 'step': 1318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:56:53.059699', 'step': 1318, 'epoch': 2} {'type': 'loss', 'content': 0.00931278895586729, 'timestamp': '2025-09-04 03:56:53.072457', 'step': 1319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:56:53.183382', 'step': 1319, 'epoch': 2} {'type': 'loss', 'content': 0.008658390492200851, 'timestamp': '2025-09-04 03:56:53.204794', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:01.599545', 'step': 1320, 'epoch': 2} {'type': 'pplx', 'content': 351.6464470611636, 'timestamp': '2025-09-04 03:57:01.601996', 'step': 1320, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1320', 'timestamp': '2025-09-04 03:57:02.093639', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:02.198545', 'step': 1320, 'epoch': 2} {'type': 'loss', 'content': 0.009841070510447025, 'timestamp': '2025-09-04 03:57:02.220801', 'step': 1321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1392], 'flops': 27840169073088.0}, 'timestamp': '2025-09-04 03:57:02.426115', 'step': 1321, 'epoch': 2} {'type': 'loss', 'content': 0.016447851434350014, 'timestamp': '2025-09-04 03:57:02.465621', 'step': 1322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:57:02.576052', 'step': 1322, 'epoch': 2} {'type': 'loss', 'content': 0.030630143359303474, 'timestamp': '2025-09-04 03:57:02.596668', 'step': 1323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 03:57:02.812115', 'step': 1323, 'epoch': 2} {'type': 'loss', 'content': 0.03536463528871536, 'timestamp': '2025-09-04 03:57:02.853461', 'step': 1324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:02.955808', 'step': 1324, 'epoch': 2} {'type': 'loss', 'content': 0.008404337801039219, 'timestamp': '2025-09-04 03:57:02.976945', 'step': 1325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:03.085024', 'step': 1325, 'epoch': 2} {'type': 'loss', 'content': 0.02297317609190941, 'timestamp': '2025-09-04 03:57:03.104900', 'step': 1326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:03.195185', 'step': 1326, 'epoch': 2} {'type': 'loss', 'content': 0.0200307946652174, 'timestamp': '2025-09-04 03:57:03.212065', 'step': 1327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:03.311518', 'step': 1327, 'epoch': 2} {'type': 'loss', 'content': 0.006862281356006861, 'timestamp': '2025-09-04 03:57:03.330810', 'step': 1328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:03.436212', 'step': 1328, 'epoch': 2} {'type': 'loss', 'content': 0.13922302424907684, 'timestamp': '2025-09-04 03:57:03.458300', 'step': 1329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:57:03.567713', 'step': 1329, 'epoch': 2} {'type': 'loss', 'content': 0.028604896739125252, 'timestamp': '2025-09-04 03:57:03.588279', 'step': 1330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:03.690531', 'step': 1330, 'epoch': 2} {'type': 'loss', 'content': 0.015768418088555336, 'timestamp': '2025-09-04 03:57:03.709730', 'step': 1331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:03.803908', 'step': 1331, 'epoch': 2} {'type': 'loss', 'content': 0.021719908341765404, 'timestamp': '2025-09-04 03:57:03.821947', 'step': 1332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:57:03.896565', 'step': 1332, 'epoch': 2} {'type': 'loss', 'content': 0.02307227812707424, 'timestamp': '2025-09-04 03:57:03.911261', 'step': 1333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:04.002837', 'step': 1333, 'epoch': 2} {'type': 'loss', 'content': 0.0028728533070534468, 'timestamp': '2025-09-04 03:57:04.019376', 'step': 1334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:04.122476', 'step': 1334, 'epoch': 2} {'type': 'loss', 'content': 0.03315138444304466, 'timestamp': '2025-09-04 03:57:04.141454', 'step': 1335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:04.236926', 'step': 1335, 'epoch': 2} {'type': 'loss', 'content': 0.008455898612737656, 'timestamp': '2025-09-04 03:57:04.255020', 'step': 1336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:04.361785', 'step': 1336, 'epoch': 2} {'type': 'loss', 'content': 0.1121997982263565, 'timestamp': '2025-09-04 03:57:04.384050', 'step': 1337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:04.497628', 'step': 1337, 'epoch': 2} {'type': 'loss', 'content': 0.017756765708327293, 'timestamp': '2025-09-04 03:57:04.517733', 'step': 1338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:57:04.603610', 'step': 1338, 'epoch': 2} {'type': 'loss', 'content': 0.00965754222124815, 'timestamp': '2025-09-04 03:57:04.618677', 'step': 1339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:57:04.696431', 'step': 1339, 'epoch': 2} {'type': 'loss', 'content': 0.02266879379749298, 'timestamp': '2025-09-04 03:57:04.711103', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:13.127580', 'step': 1340, 'epoch': 2} {'type': 'pplx', 'content': 356.8557867994047, 'timestamp': '2025-09-04 03:57:13.129660', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:57:13.211956', 'step': 1340, 'epoch': 2} {'type': 'loss', 'content': 0.028563033789396286, 'timestamp': '2025-09-04 03:57:13.228882', 'step': 1341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:57:13.330240', 'step': 1341, 'epoch': 2} {'type': 'loss', 'content': 0.04276951029896736, 'timestamp': '2025-09-04 03:57:13.349088', 'step': 1342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:13.455596', 'step': 1342, 'epoch': 2} {'type': 'loss', 'content': 0.017385128885507584, 'timestamp': '2025-09-04 03:57:13.475336', 'step': 1343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:13.559205', 'step': 1343, 'epoch': 2} {'type': 'loss', 'content': 0.009770605713129044, 'timestamp': '2025-09-04 03:57:13.574334', 'step': 1344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:57:13.650919', 'step': 1344, 'epoch': 2} {'type': 'loss', 'content': 0.01318974420428276, 'timestamp': '2025-09-04 03:57:13.665861', 'step': 1345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:13.774914', 'step': 1345, 'epoch': 2} {'type': 'loss', 'content': 0.004364403896033764, 'timestamp': '2025-09-04 03:57:13.795071', 'step': 1346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:13.899168', 'step': 1346, 'epoch': 2} {'type': 'loss', 'content': 0.007132431026548147, 'timestamp': '2025-09-04 03:57:13.918237', 'step': 1347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:14.009956', 'step': 1347, 'epoch': 2} {'type': 'loss', 'content': 0.012063604779541492, 'timestamp': '2025-09-04 03:57:14.027233', 'step': 1348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:14.125654', 'step': 1348, 'epoch': 2} {'type': 'loss', 'content': 0.020999517291784286, 'timestamp': '2025-09-04 03:57:14.145819', 'step': 1349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:14.239683', 'step': 1349, 'epoch': 2} {'type': 'loss', 'content': 0.007064122706651688, 'timestamp': '2025-09-04 03:57:14.256533', 'step': 1350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:14.359985', 'step': 1350, 'epoch': 2} {'type': 'loss', 'content': 0.00644453801214695, 'timestamp': '2025-09-04 03:57:14.378894', 'step': 1351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:14.465017', 'step': 1351, 'epoch': 2} {'type': 'loss', 'content': 0.004426640458405018, 'timestamp': '2025-09-04 03:57:14.480574', 'step': 1352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:14.586535', 'step': 1352, 'epoch': 2} {'type': 'loss', 'content': 0.00213529821485281, 'timestamp': '2025-09-04 03:57:14.608254', 'step': 1353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:14.703837', 'step': 1353, 'epoch': 2} {'type': 'loss', 'content': 0.009575147181749344, 'timestamp': '2025-09-04 03:57:14.721141', 'step': 1354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:14.812433', 'step': 1354, 'epoch': 2} {'type': 'loss', 'content': 0.009818262420594692, 'timestamp': '2025-09-04 03:57:14.829234', 'step': 1355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:14.907815', 'step': 1355, 'epoch': 2} {'type': 'loss', 'content': 0.006910801865160465, 'timestamp': '2025-09-04 03:57:14.922554', 'step': 1356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:57:15.020511', 'step': 1356, 'epoch': 2} {'type': 'loss', 'content': 0.025037406012415886, 'timestamp': '2025-09-04 03:57:15.040919', 'step': 1357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:15.141088', 'step': 1357, 'epoch': 2} {'type': 'loss', 'content': 0.058873310685157776, 'timestamp': '2025-09-04 03:57:15.159416', 'step': 1358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:15.257132', 'step': 1358, 'epoch': 2} {'type': 'loss', 'content': 0.008470847271382809, 'timestamp': '2025-09-04 03:57:15.274434', 'step': 1359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:15.380421', 'step': 1359, 'epoch': 2} {'type': 'loss', 'content': 0.038173649460077286, 'timestamp': '2025-09-04 03:57:15.400957', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:23.804518', 'step': 1360, 'epoch': 2} {'type': 'pplx', 'content': 358.21292183839546, 'timestamp': '2025-09-04 03:57:23.806482', 'step': 1360, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1360', 'timestamp': '2025-09-04 03:57:24.379494', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:24.471417', 'step': 1360, 'epoch': 2} {'type': 'loss', 'content': 0.003160255728289485, 'timestamp': '2025-09-04 03:57:24.490511', 'step': 1361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:57:24.600621', 'step': 1361, 'epoch': 2} {'type': 'loss', 'content': 0.01291283406317234, 'timestamp': '2025-09-04 03:57:24.620887', 'step': 1362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:24.712441', 'step': 1362, 'epoch': 2} {'type': 'loss', 'content': 0.01215626671910286, 'timestamp': '2025-09-04 03:57:24.729005', 'step': 1363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 03:57:24.949046', 'step': 1363, 'epoch': 2} {'type': 'loss', 'content': 0.007885631173849106, 'timestamp': '2025-09-04 03:57:24.992019', 'step': 1364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:57:25.085325', 'step': 1364, 'epoch': 2} {'type': 'loss', 'content': 0.009847918525338173, 'timestamp': '2025-09-04 03:57:25.104269', 'step': 1365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:25.212570', 'step': 1365, 'epoch': 2} {'type': 'loss', 'content': 0.056854456663131714, 'timestamp': '2025-09-04 03:57:25.232642', 'step': 1366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 03:57:25.361488', 'step': 1366, 'epoch': 2} {'type': 'loss', 'content': 0.05310133844614029, 'timestamp': '2025-09-04 03:57:25.384890', 'step': 1367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:25.492476', 'step': 1367, 'epoch': 2} {'type': 'loss', 'content': 0.014655319042503834, 'timestamp': '2025-09-04 03:57:25.512923', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:57:25.595251', 'step': 1368, 'epoch': 2} {'type': 'loss', 'content': 0.012807800434529781, 'timestamp': '2025-09-04 03:57:25.611697', 'step': 1369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:25.714660', 'step': 1369, 'epoch': 2} {'type': 'loss', 'content': 0.012346881441771984, 'timestamp': '2025-09-04 03:57:25.733731', 'step': 1370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:25.812931', 'step': 1370, 'epoch': 2} {'type': 'loss', 'content': 0.08287136256694794, 'timestamp': '2025-09-04 03:57:25.826896', 'step': 1371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:25.934910', 'step': 1371, 'epoch': 2} {'type': 'loss', 'content': 0.020002219825983047, 'timestamp': '2025-09-04 03:57:25.955901', 'step': 1372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:26.064243', 'step': 1372, 'epoch': 2} {'type': 'loss', 'content': 0.11615262180566788, 'timestamp': '2025-09-04 03:57:26.084428', 'step': 1373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:57:26.196826', 'step': 1373, 'epoch': 2} {'type': 'loss', 'content': 0.006950510665774345, 'timestamp': '2025-09-04 03:57:26.217171', 'step': 1374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:26.324298', 'step': 1374, 'epoch': 2} {'type': 'loss', 'content': 0.008125067688524723, 'timestamp': '2025-09-04 03:57:26.344166', 'step': 1375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:57:26.427863', 'step': 1375, 'epoch': 2} {'type': 'loss', 'content': 0.027247097343206406, 'timestamp': '2025-09-04 03:57:26.443748', 'step': 1376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:26.525142', 'step': 1376, 'epoch': 2} {'type': 'loss', 'content': 0.0010083671659231186, 'timestamp': '2025-09-04 03:57:26.541440', 'step': 1377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:26.641242', 'step': 1377, 'epoch': 2} {'type': 'loss', 'content': 0.021171875298023224, 'timestamp': '2025-09-04 03:57:26.659635', 'step': 1378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:57:26.770114', 'step': 1378, 'epoch': 2} {'type': 'loss', 'content': 0.05313687399029732, 'timestamp': '2025-09-04 03:57:26.790735', 'step': 1379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:26.897067', 'step': 1379, 'epoch': 2} {'type': 'loss', 'content': 0.009001716040074825, 'timestamp': '2025-09-04 03:57:26.917597', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:35.371165', 'step': 1380, 'epoch': 2} {'type': 'pplx', 'content': 356.8696831217775, 'timestamp': '2025-09-04 03:57:35.374916', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:35.479063', 'step': 1380, 'epoch': 2} {'type': 'loss', 'content': 0.03470727056264877, 'timestamp': '2025-09-04 03:57:35.501369', 'step': 1381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:35.598653', 'step': 1381, 'epoch': 2} {'type': 'loss', 'content': 0.04004732519388199, 'timestamp': '2025-09-04 03:57:35.616139', 'step': 1382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:35.722177', 'step': 1382, 'epoch': 2} {'type': 'loss', 'content': 0.03036014549434185, 'timestamp': '2025-09-04 03:57:35.741436', 'step': 1383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:35.849375', 'step': 1383, 'epoch': 2} {'type': 'loss', 'content': 0.05383269488811493, 'timestamp': '2025-09-04 03:57:35.870112', 'step': 1384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:35.963863', 'step': 1384, 'epoch': 2} {'type': 'loss', 'content': 0.10220647603273392, 'timestamp': '2025-09-04 03:57:35.983063', 'step': 1385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:57:36.070880', 'step': 1385, 'epoch': 2} {'type': 'loss', 'content': 0.010271705687046051, 'timestamp': '2025-09-04 03:57:36.086507', 'step': 1386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:36.190060', 'step': 1386, 'epoch': 2} {'type': 'loss', 'content': 0.018055927008390427, 'timestamp': '2025-09-04 03:57:36.209350', 'step': 1387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:36.288183', 'step': 1387, 'epoch': 2} {'type': 'loss', 'content': 0.0347399078309536, 'timestamp': '2025-09-04 03:57:36.303106', 'step': 1388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:57:36.374190', 'step': 1388, 'epoch': 2} {'type': 'loss', 'content': 0.017108459025621414, 'timestamp': '2025-09-04 03:57:36.388356', 'step': 1389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:36.467290', 'step': 1389, 'epoch': 2} {'type': 'loss', 'content': 0.019396977499127388, 'timestamp': '2025-09-04 03:57:36.481460', 'step': 1390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:36.584044', 'step': 1390, 'epoch': 2} {'type': 'loss', 'content': 0.029010986909270287, 'timestamp': '2025-09-04 03:57:36.603372', 'step': 1391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:57:36.688236', 'step': 1391, 'epoch': 2} {'type': 'loss', 'content': 0.05589752271771431, 'timestamp': '2025-09-04 03:57:36.704586', 'step': 1392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:36.795039', 'step': 1392, 'epoch': 2} {'type': 'loss', 'content': 0.027025602757930756, 'timestamp': '2025-09-04 03:57:36.813941', 'step': 1393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:57:36.936119', 'step': 1393, 'epoch': 2} {'type': 'loss', 'content': 0.0015675474423915148, 'timestamp': '2025-09-04 03:57:36.959466', 'step': 1394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:37.044245', 'step': 1394, 'epoch': 2} {'type': 'loss', 'content': 0.02568882144987583, 'timestamp': '2025-09-04 03:57:37.059306', 'step': 1395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:37.158661', 'step': 1395, 'epoch': 2} {'type': 'loss', 'content': 0.06579367071390152, 'timestamp': '2025-09-04 03:57:37.178098', 'step': 1396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:57:37.275495', 'step': 1396, 'epoch': 2} {'type': 'loss', 'content': 0.013132071122527122, 'timestamp': '2025-09-04 03:57:37.296202', 'step': 1397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:57:37.402412', 'step': 1397, 'epoch': 2} {'type': 'loss', 'content': 0.00368613563477993, 'timestamp': '2025-09-04 03:57:37.422511', 'step': 1398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:37.520438', 'step': 1398, 'epoch': 2} {'type': 'loss', 'content': 0.027636591345071793, 'timestamp': '2025-09-04 03:57:37.539120', 'step': 1399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:37.633885', 'step': 1399, 'epoch': 2} {'type': 'loss', 'content': 0.01690077967941761, 'timestamp': '2025-09-04 03:57:37.652181', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:46.023669', 'step': 1400, 'epoch': 2} {'type': 'pplx', 'content': 355.6197814395891, 'timestamp': '2025-09-04 03:57:46.025721', 'step': 1400, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1400', 'timestamp': '2025-09-04 03:57:46.538383', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:57:46.671119', 'step': 1400, 'epoch': 2} {'type': 'loss', 'content': 0.035729728639125824, 'timestamp': '2025-09-04 03:57:46.699862', 'step': 1401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:46.790258', 'step': 1401, 'epoch': 2} {'type': 'loss', 'content': 0.06573422253131866, 'timestamp': '2025-09-04 03:57:46.806890', 'step': 1402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:57:46.909490', 'step': 1402, 'epoch': 2} {'type': 'loss', 'content': 0.014802222140133381, 'timestamp': '2025-09-04 03:57:46.928221', 'step': 1403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:47.030134', 'step': 1403, 'epoch': 2} {'type': 'loss', 'content': 0.03306787088513374, 'timestamp': '2025-09-04 03:57:47.049863', 'step': 1404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:57:47.133628', 'step': 1404, 'epoch': 2} {'type': 'loss', 'content': 0.014069135300815105, 'timestamp': '2025-09-04 03:57:47.150471', 'step': 1405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:57:47.254620', 'step': 1405, 'epoch': 2} {'type': 'loss', 'content': 0.011790101416409016, 'timestamp': '2025-09-04 03:57:47.273633', 'step': 1406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:57:47.357993', 'step': 1406, 'epoch': 2} {'type': 'loss', 'content': 0.036905501037836075, 'timestamp': '2025-09-04 03:57:47.373168', 'step': 1407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:57:47.490818', 'step': 1407, 'epoch': 2} {'type': 'loss', 'content': 0.04053547978401184, 'timestamp': '2025-09-04 03:57:47.513458', 'step': 1408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:47.610652', 'step': 1408, 'epoch': 2} {'type': 'loss', 'content': 0.10574720799922943, 'timestamp': '2025-09-04 03:57:47.630819', 'step': 1409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:47.726719', 'step': 1409, 'epoch': 2} {'type': 'loss', 'content': 0.03774774447083473, 'timestamp': '2025-09-04 03:57:47.743637', 'step': 1410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:57:47.855271', 'step': 1410, 'epoch': 2} {'type': 'loss', 'content': 0.11015975475311279, 'timestamp': '2025-09-04 03:57:47.875714', 'step': 1411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:47.975623', 'step': 1411, 'epoch': 2} {'type': 'loss', 'content': 0.031532928347587585, 'timestamp': '2025-09-04 03:57:47.994819', 'step': 1412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:48.084984', 'step': 1412, 'epoch': 2} {'type': 'loss', 'content': 0.017254313454031944, 'timestamp': '2025-09-04 03:57:48.103711', 'step': 1413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:57:48.180312', 'step': 1413, 'epoch': 2} {'type': 'loss', 'content': 0.12299531698226929, 'timestamp': '2025-09-04 03:57:48.194006', 'step': 1414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:48.289289', 'step': 1414, 'epoch': 2} {'type': 'loss', 'content': 0.011667820625007153, 'timestamp': '2025-09-04 03:57:48.306300', 'step': 1415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:48.415936', 'step': 1415, 'epoch': 2} {'type': 'loss', 'content': 0.06725858896970749, 'timestamp': '2025-09-04 03:57:48.435803', 'step': 1416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:57:48.528418', 'step': 1416, 'epoch': 2} {'type': 'loss', 'content': 0.022478509694337845, 'timestamp': '2025-09-04 03:57:48.547300', 'step': 1417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:48.656960', 'step': 1417, 'epoch': 2} {'type': 'loss', 'content': 0.034010887145996094, 'timestamp': '2025-09-04 03:57:48.677147', 'step': 1418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:48.775323', 'step': 1418, 'epoch': 2} {'type': 'loss', 'content': 0.030013922601938248, 'timestamp': '2025-09-04 03:57:48.792730', 'step': 1419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:57:48.910608', 'step': 1419, 'epoch': 2} {'type': 'loss', 'content': 0.004974214360117912, 'timestamp': '2025-09-04 03:57:48.933542', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:57:57.300582', 'step': 1420, 'epoch': 2} {'type': 'pplx', 'content': 352.19748924586287, 'timestamp': '2025-09-04 03:57:57.303117', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 03:57:57.483232', 'step': 1420, 'epoch': 2} {'type': 'loss', 'content': 0.004329991061240435, 'timestamp': '2025-09-04 03:57:57.521231', 'step': 1421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:57.605182', 'step': 1421, 'epoch': 2} {'type': 'loss', 'content': 0.012752629816532135, 'timestamp': '2025-09-04 03:57:57.620094', 'step': 1422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:57:57.737791', 'step': 1422, 'epoch': 2} {'type': 'loss', 'content': 0.011430691927671432, 'timestamp': '2025-09-04 03:57:57.759708', 'step': 1423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:57:57.834971', 'step': 1423, 'epoch': 2} {'type': 'loss', 'content': 0.09056932479143143, 'timestamp': '2025-09-04 03:57:57.849283', 'step': 1424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:57.928832', 'step': 1424, 'epoch': 2} {'type': 'loss', 'content': 0.05736755579710007, 'timestamp': '2025-09-04 03:57:57.944122', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:57:58.034731', 'step': 1425, 'epoch': 2} {'type': 'loss', 'content': 0.010904895141720772, 'timestamp': '2025-09-04 03:57:58.051389', 'step': 1426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:57:58.161440', 'step': 1426, 'epoch': 2} {'type': 'loss', 'content': 0.015301964245736599, 'timestamp': '2025-09-04 03:57:58.181793', 'step': 1427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:57:58.278459', 'step': 1427, 'epoch': 2} {'type': 'loss', 'content': 0.006432659458369017, 'timestamp': '2025-09-04 03:57:58.296416', 'step': 1428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:57:58.393057', 'step': 1428, 'epoch': 2} {'type': 'loss', 'content': 0.04648457467556, 'timestamp': '2025-09-04 03:57:58.413285', 'step': 1429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:57:58.498896', 'step': 1429, 'epoch': 2} {'type': 'loss', 'content': 0.012602617032825947, 'timestamp': '2025-09-04 03:57:58.513769', 'step': 1430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:57:58.622078', 'step': 1430, 'epoch': 2} {'type': 'loss', 'content': 0.00956171378493309, 'timestamp': '2025-09-04 03:57:58.642137', 'step': 1431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:57:58.736074', 'step': 1431, 'epoch': 2} {'type': 'loss', 'content': 0.20023715496063232, 'timestamp': '2025-09-04 03:57:58.753934', 'step': 1432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:57:58.831296', 'step': 1432, 'epoch': 2} {'type': 'loss', 'content': 0.03244839608669281, 'timestamp': '2025-09-04 03:57:58.846617', 'step': 1433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:57:58.947316', 'step': 1433, 'epoch': 2} {'type': 'loss', 'content': 0.019599279388785362, 'timestamp': '2025-09-04 03:57:58.965931', 'step': 1434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:57:59.083309', 'step': 1434, 'epoch': 2} {'type': 'loss', 'content': 0.016808245331048965, 'timestamp': '2025-09-04 03:57:59.105217', 'step': 1435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:57:59.216073', 'step': 1435, 'epoch': 2} {'type': 'loss', 'content': 0.017745893448591232, 'timestamp': '2025-09-04 03:57:59.237285', 'step': 1436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:59.336156', 'step': 1436, 'epoch': 2} {'type': 'loss', 'content': 0.007716418243944645, 'timestamp': '2025-09-04 03:57:59.356987', 'step': 1437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:59.452348', 'step': 1437, 'epoch': 2} {'type': 'loss', 'content': 0.0655415803194046, 'timestamp': '2025-09-04 03:57:59.469660', 'step': 1438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:57:59.572428', 'step': 1438, 'epoch': 2} {'type': 'loss', 'content': 0.039280664175748825, 'timestamp': '2025-09-04 03:57:59.591378', 'step': 1439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:57:59.686127', 'step': 1439, 'epoch': 2} {'type': 'loss', 'content': 0.047219302505254745, 'timestamp': '2025-09-04 03:57:59.704167', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:58:08.097411', 'step': 1440, 'epoch': 2} {'type': 'pplx', 'content': 347.4976195756163, 'timestamp': '2025-09-04 03:58:08.099469', 'step': 1440, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1440', 'timestamp': '2025-09-04 03:58:08.613639', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:08.712984', 'step': 1440, 'epoch': 2} {'type': 'loss', 'content': 0.022071918472647667, 'timestamp': '2025-09-04 03:58:08.733921', 'step': 1441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:08.842886', 'step': 1441, 'epoch': 2} {'type': 'loss', 'content': 0.02069196105003357, 'timestamp': '2025-09-04 03:58:08.862926', 'step': 1442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:08.972102', 'step': 1442, 'epoch': 2} {'type': 'loss', 'content': 0.001259633689187467, 'timestamp': '2025-09-04 03:58:08.992159', 'step': 1443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:09.098857', 'step': 1443, 'epoch': 2} {'type': 'loss', 'content': 0.05347844213247299, 'timestamp': '2025-09-04 03:58:09.119414', 'step': 1444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 03:58:09.245780', 'step': 1444, 'epoch': 2} {'type': 'loss', 'content': 0.05717499926686287, 'timestamp': '2025-09-04 03:58:09.272832', 'step': 1445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:09.373325', 'step': 1445, 'epoch': 2} {'type': 'loss', 'content': 0.040185511112213135, 'timestamp': '2025-09-04 03:58:09.392288', 'step': 1446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:09.500710', 'step': 1446, 'epoch': 2} {'type': 'loss', 'content': 0.024745700880885124, 'timestamp': '2025-09-04 03:58:09.520583', 'step': 1447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:09.628530', 'step': 1447, 'epoch': 2} {'type': 'loss', 'content': 0.011510748416185379, 'timestamp': '2025-09-04 03:58:09.649508', 'step': 1448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:58:09.734063', 'step': 1448, 'epoch': 2} {'type': 'loss', 'content': 0.014999719336628914, 'timestamp': '2025-09-04 03:58:09.750818', 'step': 1449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:58:09.861652', 'step': 1449, 'epoch': 2} {'type': 'loss', 'content': 0.006128712557256222, 'timestamp': '2025-09-04 03:58:09.882168', 'step': 1450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:09.985238', 'step': 1450, 'epoch': 2} {'type': 'loss', 'content': 0.013433975167572498, 'timestamp': '2025-09-04 03:58:10.004317', 'step': 1451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:58:10.081730', 'step': 1451, 'epoch': 2} {'type': 'loss', 'content': 0.007306399755179882, 'timestamp': '2025-09-04 03:58:10.096553', 'step': 1452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:58:10.188094', 'step': 1452, 'epoch': 2} {'type': 'loss', 'content': 0.004065982531756163, 'timestamp': '2025-09-04 03:58:10.207086', 'step': 1453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:10.307494', 'step': 1453, 'epoch': 2} {'type': 'loss', 'content': 0.005130627192556858, 'timestamp': '2025-09-04 03:58:10.326129', 'step': 1454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:58:10.404524', 'step': 1454, 'epoch': 2} {'type': 'loss', 'content': 0.01915198192000389, 'timestamp': '2025-09-04 03:58:10.418656', 'step': 1455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:10.519421', 'step': 1455, 'epoch': 2} {'type': 'loss', 'content': 0.006536707282066345, 'timestamp': '2025-09-04 03:58:10.538875', 'step': 1456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:58:10.645308', 'step': 1456, 'epoch': 2} {'type': 'loss', 'content': 0.029277930036187172, 'timestamp': '2025-09-04 03:58:10.667816', 'step': 1457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:58:10.784355', 'step': 1457, 'epoch': 2} {'type': 'loss', 'content': 0.01957911066710949, 'timestamp': '2025-09-04 03:58:10.806480', 'step': 1458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:58:10.916254', 'step': 1458, 'epoch': 2} {'type': 'loss', 'content': 0.009834788739681244, 'timestamp': '2025-09-04 03:58:10.936503', 'step': 1459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:11.044736', 'step': 1459, 'epoch': 2} {'type': 'loss', 'content': 0.040386419743299484, 'timestamp': '2025-09-04 03:58:11.065853', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:58:19.451938', 'step': 1460, 'epoch': 2} {'type': 'pplx', 'content': 348.4467862100084, 'timestamp': '2025-09-04 03:58:19.453968', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:58:19.549185', 'step': 1460, 'epoch': 2} {'type': 'loss', 'content': 0.05862518399953842, 'timestamp': '2025-09-04 03:58:19.569593', 'step': 1461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:58:19.653427', 'step': 1461, 'epoch': 2} {'type': 'loss', 'content': 0.010720998048782349, 'timestamp': '2025-09-04 03:58:19.668367', 'step': 1462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:19.769165', 'step': 1462, 'epoch': 2} {'type': 'loss', 'content': 0.0041922107338905334, 'timestamp': '2025-09-04 03:58:19.788193', 'step': 1463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:58:19.871897', 'step': 1463, 'epoch': 2} {'type': 'loss', 'content': 0.02185150980949402, 'timestamp': '2025-09-04 03:58:19.887593', 'step': 1464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:19.978920', 'step': 1464, 'epoch': 2} {'type': 'loss', 'content': 0.02324114739894867, 'timestamp': '2025-09-04 03:58:19.997814', 'step': 1465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:58:20.089399', 'step': 1465, 'epoch': 2} {'type': 'loss', 'content': 0.01927514187991619, 'timestamp': '2025-09-04 03:58:20.105963', 'step': 1466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:58:20.200168', 'step': 1466, 'epoch': 2} {'type': 'loss', 'content': 0.029533013701438904, 'timestamp': '2025-09-04 03:58:20.217003', 'step': 1467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:20.321673', 'step': 1467, 'epoch': 2} {'type': 'loss', 'content': 0.06191162019968033, 'timestamp': '2025-09-04 03:58:20.341487', 'step': 1468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:20.444498', 'step': 1468, 'epoch': 2} {'type': 'loss', 'content': 0.008275226689875126, 'timestamp': '2025-09-04 03:58:20.466423', 'step': 1469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:20.562481', 'step': 1469, 'epoch': 2} {'type': 'loss', 'content': 0.020667975768446922, 'timestamp': '2025-09-04 03:58:20.579977', 'step': 1470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:58:20.653299', 'step': 1470, 'epoch': 2} {'type': 'loss', 'content': 0.023650793358683586, 'timestamp': '2025-09-04 03:58:20.665998', 'step': 1471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:20.767543', 'step': 1471, 'epoch': 2} {'type': 'loss', 'content': 0.02776520885527134, 'timestamp': '2025-09-04 03:58:20.786811', 'step': 1472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:20.864142', 'step': 1472, 'epoch': 2} {'type': 'loss', 'content': 0.01794314943253994, 'timestamp': '2025-09-04 03:58:20.878780', 'step': 1473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:20.990087', 'step': 1473, 'epoch': 2} {'type': 'loss', 'content': 0.028163762763142586, 'timestamp': '2025-09-04 03:58:21.009697', 'step': 1474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:21.118248', 'step': 1474, 'epoch': 2} {'type': 'loss', 'content': 0.016635507345199585, 'timestamp': '2025-09-04 03:58:21.137598', 'step': 1475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:58:21.223856', 'step': 1475, 'epoch': 2} {'type': 'loss', 'content': 0.09255839139223099, 'timestamp': '2025-09-04 03:58:21.239630', 'step': 1476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:21.338871', 'step': 1476, 'epoch': 2} {'type': 'loss', 'content': 0.03798213601112366, 'timestamp': '2025-09-04 03:58:21.359255', 'step': 1477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:21.460978', 'step': 1477, 'epoch': 2} {'type': 'loss', 'content': 0.02084563672542572, 'timestamp': '2025-09-04 03:58:21.479589', 'step': 1478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:58:21.563333', 'step': 1478, 'epoch': 2} {'type': 'loss', 'content': 0.009020745754241943, 'timestamp': '2025-09-04 03:58:21.578184', 'step': 1479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:21.688017', 'step': 1479, 'epoch': 2} {'type': 'loss', 'content': 0.0042407093569636345, 'timestamp': '2025-09-04 03:58:21.708855', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:58:30.096181', 'step': 1480, 'epoch': 2} {'type': 'pplx', 'content': 350.9113278716737, 'timestamp': '2025-09-04 03:58:30.098321', 'step': 1480, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1480', 'timestamp': '2025-09-04 03:58:30.453234', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:30.528236', 'step': 1480, 'epoch': 2} {'type': 'loss', 'content': 0.042050596326589584, 'timestamp': '2025-09-04 03:58:30.543158', 'step': 1481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:30.651692', 'step': 1481, 'epoch': 2} {'type': 'loss', 'content': 0.023750372231006622, 'timestamp': '2025-09-04 03:58:30.672015', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:30.776255', 'step': 1482, 'epoch': 2} {'type': 'loss', 'content': 0.008751154877245426, 'timestamp': '2025-09-04 03:58:30.795315', 'step': 1483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:30.872932', 'step': 1483, 'epoch': 2} {'type': 'loss', 'content': 0.013106334023177624, 'timestamp': '2025-09-04 03:58:30.887472', 'step': 1484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 03:58:31.005957', 'step': 1484, 'epoch': 2} {'type': 'loss', 'content': 0.029924411326646805, 'timestamp': '2025-09-04 03:58:31.029590', 'step': 1485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:58:31.130014', 'step': 1485, 'epoch': 2} {'type': 'loss', 'content': 0.024293435737490654, 'timestamp': '2025-09-04 03:58:31.148274', 'step': 1486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:31.257554', 'step': 1486, 'epoch': 2} {'type': 'loss', 'content': 0.06716310232877731, 'timestamp': '2025-09-04 03:58:31.277164', 'step': 1487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:31.384124', 'step': 1487, 'epoch': 2} {'type': 'loss', 'content': 0.02108912356197834, 'timestamp': '2025-09-04 03:58:31.404686', 'step': 1488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:31.481010', 'step': 1488, 'epoch': 2} {'type': 'loss', 'content': 0.07503022998571396, 'timestamp': '2025-09-04 03:58:31.496061', 'step': 1489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:58:31.581333', 'step': 1489, 'epoch': 2} {'type': 'loss', 'content': 0.0028276981320232153, 'timestamp': '2025-09-04 03:58:31.596330', 'step': 1490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:58:31.689572', 'step': 1490, 'epoch': 2} {'type': 'loss', 'content': 0.02281711809337139, 'timestamp': '2025-09-04 03:58:31.706718', 'step': 1491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:58:31.816423', 'step': 1491, 'epoch': 2} {'type': 'loss', 'content': 0.026799194514751434, 'timestamp': '2025-09-04 03:58:31.837569', 'step': 1492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:58:31.945511', 'step': 1492, 'epoch': 2} {'type': 'loss', 'content': 0.027529401704669, 'timestamp': '2025-09-04 03:58:31.968207', 'step': 1493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:32.077972', 'step': 1493, 'epoch': 2} {'type': 'loss', 'content': 0.0032774689607322216, 'timestamp': '2025-09-04 03:58:32.097948', 'step': 1494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:32.193448', 'step': 1494, 'epoch': 2} {'type': 'loss', 'content': 0.03091510199010372, 'timestamp': '2025-09-04 03:58:32.210579', 'step': 1495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:32.289594', 'step': 1495, 'epoch': 2} {'type': 'loss', 'content': 0.018100092187523842, 'timestamp': '2025-09-04 03:58:32.303714', 'step': 1496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:32.405510', 'step': 1496, 'epoch': 2} {'type': 'loss', 'content': 0.06709955632686615, 'timestamp': '2025-09-04 03:58:32.425599', 'step': 1497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:32.526213', 'step': 1497, 'epoch': 2} {'type': 'loss', 'content': 0.016714708879590034, 'timestamp': '2025-09-04 03:58:32.543025', 'step': 1498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:58:32.641548', 'step': 1498, 'epoch': 2} {'type': 'loss', 'content': 0.009932179003953934, 'timestamp': '2025-09-04 03:58:32.658618', 'step': 1499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:58:32.771050', 'step': 1499, 'epoch': 2} {'type': 'loss', 'content': 0.002495410619303584, 'timestamp': '2025-09-04 03:58:32.792042', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:58:41.271072', 'step': 1500, 'epoch': 2} {'type': 'pplx', 'content': 352.31429226333887, 'timestamp': '2025-09-04 03:58:41.273723', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:58:41.354514', 'step': 1500, 'epoch': 2} {'type': 'loss', 'content': 0.11568693816661835, 'timestamp': '2025-09-04 03:58:41.371425', 'step': 1501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:41.475740', 'step': 1501, 'epoch': 2} {'type': 'loss', 'content': 0.01670590229332447, 'timestamp': '2025-09-04 03:58:41.494996', 'step': 1502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:58:41.589828', 'step': 1502, 'epoch': 2} {'type': 'loss', 'content': 0.019445618614554405, 'timestamp': '2025-09-04 03:58:41.606867', 'step': 1503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:58:41.685355', 'step': 1503, 'epoch': 2} {'type': 'loss', 'content': 0.057172223925590515, 'timestamp': '2025-09-04 03:58:41.700285', 'step': 1504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:41.800836', 'step': 1504, 'epoch': 2} {'type': 'loss', 'content': 0.00811680406332016, 'timestamp': '2025-09-04 03:58:41.821852', 'step': 1505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:41.904781', 'step': 1505, 'epoch': 2} {'type': 'loss', 'content': 0.04069969430565834, 'timestamp': '2025-09-04 03:58:41.918645', 'step': 1506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:58:42.006065', 'step': 1506, 'epoch': 2} {'type': 'loss', 'content': 0.011170770972967148, 'timestamp': '2025-09-04 03:58:42.019930', 'step': 1507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:42.124593', 'step': 1507, 'epoch': 2} {'type': 'loss', 'content': 0.011762701906263828, 'timestamp': '2025-09-04 03:58:42.144303', 'step': 1508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:42.250667', 'step': 1508, 'epoch': 2} {'type': 'loss', 'content': 0.004596356768161058, 'timestamp': '2025-09-04 03:58:42.272663', 'step': 1509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:42.379939', 'step': 1509, 'epoch': 2} {'type': 'loss', 'content': 0.005542484112083912, 'timestamp': '2025-09-04 03:58:42.399644', 'step': 1510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:42.504187', 'step': 1510, 'epoch': 2} {'type': 'loss', 'content': 0.048978500068187714, 'timestamp': '2025-09-04 03:58:42.523284', 'step': 1511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:42.627086', 'step': 1511, 'epoch': 2} {'type': 'loss', 'content': 0.0095089515671134, 'timestamp': '2025-09-04 03:58:42.646752', 'step': 1512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:42.745841', 'step': 1512, 'epoch': 2} {'type': 'loss', 'content': 0.012734484858810902, 'timestamp': '2025-09-04 03:58:42.766995', 'step': 1513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 03:58:42.971571', 'step': 1513, 'epoch': 2} {'type': 'loss', 'content': 0.0094110993668437, 'timestamp': '2025-09-04 03:58:43.010715', 'step': 1514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:43.087980', 'step': 1514, 'epoch': 2} {'type': 'loss', 'content': 0.006989278364926577, 'timestamp': '2025-09-04 03:58:43.101850', 'step': 1515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:43.202917', 'step': 1515, 'epoch': 2} {'type': 'loss', 'content': 0.03085504285991192, 'timestamp': '2025-09-04 03:58:43.222336', 'step': 1516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:58:43.320624', 'step': 1516, 'epoch': 2} {'type': 'loss', 'content': 0.0065007261000573635, 'timestamp': '2025-09-04 03:58:43.340918', 'step': 1517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:43.443356', 'step': 1517, 'epoch': 2} {'type': 'loss', 'content': 0.020510029047727585, 'timestamp': '2025-09-04 03:58:43.462464', 'step': 1518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:58:43.561904', 'step': 1518, 'epoch': 2} {'type': 'loss', 'content': 0.010851016268134117, 'timestamp': '2025-09-04 03:58:43.580231', 'step': 1519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:58:43.681348', 'step': 1519, 'epoch': 2} {'type': 'loss', 'content': 0.011947129853069782, 'timestamp': '2025-09-04 03:58:43.700623', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:58:52.103185', 'step': 1520, 'epoch': 2} {'type': 'pplx', 'content': 354.340881585885, 'timestamp': '2025-09-04 03:58:52.104933', 'step': 1520, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1520', 'timestamp': '2025-09-04 03:58:52.596824', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:52.671295', 'step': 1520, 'epoch': 2} {'type': 'loss', 'content': 0.004985039122402668, 'timestamp': '2025-09-04 03:58:52.686417', 'step': 1521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:52.763053', 'step': 1521, 'epoch': 2} {'type': 'loss', 'content': 0.04416259378194809, 'timestamp': '2025-09-04 03:58:52.776998', 'step': 1522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:58:52.885793', 'step': 1522, 'epoch': 2} {'type': 'loss', 'content': 0.07206206768751144, 'timestamp': '2025-09-04 03:58:52.906087', 'step': 1523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:58:52.999974', 'step': 1523, 'epoch': 2} {'type': 'loss', 'content': 0.011458688415586948, 'timestamp': '2025-09-04 03:58:53.017860', 'step': 1524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:53.116338', 'step': 1524, 'epoch': 2} {'type': 'loss', 'content': 0.014014442451298237, 'timestamp': '2025-09-04 03:58:53.137043', 'step': 1525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:53.238547', 'step': 1525, 'epoch': 2} {'type': 'loss', 'content': 0.007480642758309841, 'timestamp': '2025-09-04 03:58:53.257330', 'step': 1526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 03:58:53.477272', 'step': 1526, 'epoch': 2} {'type': 'loss', 'content': 0.0391855351626873, 'timestamp': '2025-09-04 03:58:53.519609', 'step': 1527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:58:53.622364', 'step': 1527, 'epoch': 2} {'type': 'loss', 'content': 0.002516398439183831, 'timestamp': '2025-09-04 03:58:53.642357', 'step': 1528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:58:53.717657', 'step': 1528, 'epoch': 2} {'type': 'loss', 'content': 0.028322208672761917, 'timestamp': '2025-09-04 03:58:53.732958', 'step': 1529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1200], 'flops': 24000145761984.0}, 'timestamp': '2025-09-04 03:58:53.908721', 'step': 1529, 'epoch': 2} {'type': 'loss', 'content': 0.0025973438750952482, 'timestamp': '2025-09-04 03:58:53.941662', 'step': 1530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:54.036743', 'step': 1530, 'epoch': 2} {'type': 'loss', 'content': 0.06388135254383087, 'timestamp': '2025-09-04 03:58:54.054194', 'step': 1531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:58:54.160713', 'step': 1531, 'epoch': 2} {'type': 'loss', 'content': 0.006396736484020948, 'timestamp': '2025-09-04 03:58:54.181459', 'step': 1532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:58:54.286647', 'step': 1532, 'epoch': 2} {'type': 'loss', 'content': 0.028667569160461426, 'timestamp': '2025-09-04 03:58:54.308867', 'step': 1533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:54.410070', 'step': 1533, 'epoch': 2} {'type': 'loss', 'content': 0.11603422462940216, 'timestamp': '2025-09-04 03:58:54.428882', 'step': 1534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:54.534808', 'step': 1534, 'epoch': 2} {'type': 'loss', 'content': 0.011094557121396065, 'timestamp': '2025-09-04 03:58:54.553867', 'step': 1535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:58:54.757887', 'step': 1535, 'epoch': 2} {'type': 'loss', 'content': 0.02706741727888584, 'timestamp': '2025-09-04 03:58:54.797893', 'step': 1536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:58:54.892179', 'step': 1536, 'epoch': 2} {'type': 'loss', 'content': 0.01568935438990593, 'timestamp': '2025-09-04 03:58:54.911469', 'step': 1537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:58:54.995117', 'step': 1537, 'epoch': 2} {'type': 'loss', 'content': 0.009411556646227837, 'timestamp': '2025-09-04 03:58:55.010256', 'step': 1538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:58:55.118774', 'step': 1538, 'epoch': 2} {'type': 'loss', 'content': 0.01080233883112669, 'timestamp': '2025-09-04 03:58:55.138032', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:58:55.237800', 'step': 1539, 'epoch': 2} {'type': 'loss', 'content': 0.016464075073599815, 'timestamp': '2025-09-04 03:58:55.257398', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:03.697121', 'step': 1540, 'epoch': 2} {'type': 'pplx', 'content': 358.13731037704423, 'timestamp': '2025-09-04 03:59:03.699487', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:03.800069', 'step': 1540, 'epoch': 2} {'type': 'loss', 'content': 0.017000077292323112, 'timestamp': '2025-09-04 03:59:03.821025', 'step': 1541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:59:03.926237', 'step': 1541, 'epoch': 2} {'type': 'loss', 'content': 0.018205707892775536, 'timestamp': '2025-09-04 03:59:03.945068', 'step': 1542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:59:04.069382', 'step': 1542, 'epoch': 2} {'type': 'loss', 'content': 0.032110054045915604, 'timestamp': '2025-09-04 03:59:04.091913', 'step': 1543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:04.202077', 'step': 1543, 'epoch': 2} {'type': 'loss', 'content': 0.0027143440674990416, 'timestamp': '2025-09-04 03:59:04.222663', 'step': 1544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:59:04.332288', 'step': 1544, 'epoch': 2} {'type': 'loss', 'content': 0.030216865241527557, 'timestamp': '2025-09-04 03:59:04.354365', 'step': 1545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:04.465187', 'step': 1545, 'epoch': 2} {'type': 'loss', 'content': 0.03675489500164986, 'timestamp': '2025-09-04 03:59:04.484744', 'step': 1546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:59:04.562572', 'step': 1546, 'epoch': 2} {'type': 'loss', 'content': 0.03915338218212128, 'timestamp': '2025-09-04 03:59:04.576156', 'step': 1547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:59:04.714903', 'step': 1547, 'epoch': 2} {'type': 'loss', 'content': 0.0727081373333931, 'timestamp': '2025-09-04 03:59:04.741282', 'step': 1548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:59:04.819762', 'step': 1548, 'epoch': 2} {'type': 'loss', 'content': 0.03566645830869675, 'timestamp': '2025-09-04 03:59:04.834687', 'step': 1549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:04.939334', 'step': 1549, 'epoch': 2} {'type': 'loss', 'content': 0.020596234127879143, 'timestamp': '2025-09-04 03:59:04.957967', 'step': 1550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:05.060642', 'step': 1550, 'epoch': 2} {'type': 'loss', 'content': 0.00818456057459116, 'timestamp': '2025-09-04 03:59:05.078828', 'step': 1551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:05.189583', 'step': 1551, 'epoch': 2} {'type': 'loss', 'content': 0.05260307341814041, 'timestamp': '2025-09-04 03:59:05.210406', 'step': 1552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:05.313980', 'step': 1552, 'epoch': 2} {'type': 'loss', 'content': 0.006511691492050886, 'timestamp': '2025-09-04 03:59:05.334487', 'step': 1553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:05.440815', 'step': 1553, 'epoch': 2} {'type': 'loss', 'content': 0.009990708902478218, 'timestamp': '2025-09-04 03:59:05.459864', 'step': 1554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 03:59:05.636814', 'step': 1554, 'epoch': 2} {'type': 'loss', 'content': 0.004030467942357063, 'timestamp': '2025-09-04 03:59:05.668857', 'step': 1555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:59:05.756761', 'step': 1555, 'epoch': 2} {'type': 'loss', 'content': 0.031922031193971634, 'timestamp': '2025-09-04 03:59:05.772959', 'step': 1556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:05.865809', 'step': 1556, 'epoch': 2} {'type': 'loss', 'content': 0.021661344915628433, 'timestamp': '2025-09-04 03:59:05.884706', 'step': 1557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:05.979801', 'step': 1557, 'epoch': 2} {'type': 'loss', 'content': 0.01415644958615303, 'timestamp': '2025-09-04 03:59:05.996754', 'step': 1558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:06.091347', 'step': 1558, 'epoch': 2} {'type': 'loss', 'content': 0.0253834780305624, 'timestamp': '2025-09-04 03:59:06.108290', 'step': 1559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:06.210438', 'step': 1559, 'epoch': 2} {'type': 'loss', 'content': 0.008007410913705826, 'timestamp': '2025-09-04 03:59:06.229849', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:14.729217', 'step': 1560, 'epoch': 2} {'type': 'pplx', 'content': 359.43894958396004, 'timestamp': '2025-09-04 03:59:14.731389', 'step': 1560, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1560', 'timestamp': '2025-09-04 03:59:15.095862', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:15.200111', 'step': 1560, 'epoch': 2} {'type': 'loss', 'content': 0.012292720377445221, 'timestamp': '2025-09-04 03:59:15.222316', 'step': 1561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:15.322245', 'step': 1561, 'epoch': 2} {'type': 'loss', 'content': 0.03446212038397789, 'timestamp': '2025-09-04 03:59:15.340799', 'step': 1562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:15.440100', 'step': 1562, 'epoch': 2} {'type': 'loss', 'content': 0.0038441934157162905, 'timestamp': '2025-09-04 03:59:15.458765', 'step': 1563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:59:15.569481', 'step': 1563, 'epoch': 2} {'type': 'loss', 'content': 0.009270180948078632, 'timestamp': '2025-09-04 03:59:15.590580', 'step': 1564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:15.683992', 'step': 1564, 'epoch': 2} {'type': 'loss', 'content': 0.0015130855608731508, 'timestamp': '2025-09-04 03:59:15.702558', 'step': 1565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:59:15.812496', 'step': 1565, 'epoch': 2} {'type': 'loss', 'content': 0.001221196842379868, 'timestamp': '2025-09-04 03:59:15.833028', 'step': 1566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 03:59:15.917093', 'step': 1566, 'epoch': 2} {'type': 'loss', 'content': 0.01908346638083458, 'timestamp': '2025-09-04 03:59:15.931975', 'step': 1567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:59:16.037182', 'step': 1567, 'epoch': 2} {'type': 'loss', 'content': 0.010458694770932198, 'timestamp': '2025-09-04 03:59:16.056869', 'step': 1568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 03:59:16.254852', 'step': 1568, 'epoch': 2} {'type': 'loss', 'content': 0.03718806803226471, 'timestamp': '2025-09-04 03:59:16.297722', 'step': 1569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:59:16.414907', 'step': 1569, 'epoch': 2} {'type': 'loss', 'content': 0.006845841184258461, 'timestamp': '2025-09-04 03:59:16.437167', 'step': 1570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:16.531656', 'step': 1570, 'epoch': 2} {'type': 'loss', 'content': 0.017309976741671562, 'timestamp': '2025-09-04 03:59:16.549235', 'step': 1571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:16.656486', 'step': 1571, 'epoch': 2} {'type': 'loss', 'content': 0.038009658455848694, 'timestamp': '2025-09-04 03:59:16.677504', 'step': 1572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:16.781767', 'step': 1572, 'epoch': 2} {'type': 'loss', 'content': 0.005971661768853664, 'timestamp': '2025-09-04 03:59:16.803460', 'step': 1573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:59:16.939010', 'step': 1573, 'epoch': 2} {'type': 'loss', 'content': 0.0011978168040513992, 'timestamp': '2025-09-04 03:59:16.964950', 'step': 1574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:59:17.050202', 'step': 1574, 'epoch': 2} {'type': 'loss', 'content': 0.06003192439675331, 'timestamp': '2025-09-04 03:59:17.065248', 'step': 1575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:59:17.155313', 'step': 1575, 'epoch': 2} {'type': 'loss', 'content': 0.007113071624189615, 'timestamp': '2025-09-04 03:59:17.172663', 'step': 1576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:17.269765', 'step': 1576, 'epoch': 2} {'type': 'loss', 'content': 0.012526609003543854, 'timestamp': '2025-09-04 03:59:17.289917', 'step': 1577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 03:59:17.373409', 'step': 1577, 'epoch': 2} {'type': 'loss', 'content': 0.05469227954745293, 'timestamp': '2025-09-04 03:59:17.388299', 'step': 1578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:17.481308', 'step': 1578, 'epoch': 2} {'type': 'loss', 'content': 0.020947255194187164, 'timestamp': '2025-09-04 03:59:17.498309', 'step': 1579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:59:17.570426', 'step': 1579, 'epoch': 2} {'type': 'loss', 'content': 0.049049898982048035, 'timestamp': '2025-09-04 03:59:17.583920', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:25.958273', 'step': 1580, 'epoch': 2} {'type': 'pplx', 'content': 357.77819634386793, 'timestamp': '2025-09-04 03:59:25.959793', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:59:26.031511', 'step': 1580, 'epoch': 2} {'type': 'loss', 'content': 0.022066203877329826, 'timestamp': '2025-09-04 03:59:26.046549', 'step': 1581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:59:26.124819', 'step': 1581, 'epoch': 2} {'type': 'loss', 'content': 0.02584720402956009, 'timestamp': '2025-09-04 03:59:26.139065', 'step': 1582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:26.233903', 'step': 1582, 'epoch': 2} {'type': 'loss', 'content': 0.004149719141423702, 'timestamp': '2025-09-04 03:59:26.251416', 'step': 1583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:26.344125', 'step': 1583, 'epoch': 2} {'type': 'loss', 'content': 0.026423068717122078, 'timestamp': '2025-09-04 03:59:26.361696', 'step': 1584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:26.458179', 'step': 1584, 'epoch': 2} {'type': 'loss', 'content': 0.014465868473052979, 'timestamp': '2025-09-04 03:59:26.478437', 'step': 1585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 03:59:26.595337', 'step': 1585, 'epoch': 2} {'type': 'loss', 'content': 0.04363749548792839, 'timestamp': '2025-09-04 03:59:26.617500', 'step': 1586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:26.711320', 'step': 1586, 'epoch': 2} {'type': 'loss', 'content': 0.021764691919088364, 'timestamp': '2025-09-04 03:59:26.728639', 'step': 1587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 03:59:26.866445', 'step': 1587, 'epoch': 2} {'type': 'loss', 'content': 0.008350017480552197, 'timestamp': '2025-09-04 03:59:26.893339', 'step': 1588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:26.983861', 'step': 1588, 'epoch': 2} {'type': 'loss', 'content': 0.038341108709573746, 'timestamp': '2025-09-04 03:59:27.002454', 'step': 1589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:27.096991', 'step': 1589, 'epoch': 2} {'type': 'loss', 'content': 0.033245623111724854, 'timestamp': '2025-09-04 03:59:27.114289', 'step': 1590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:27.214904', 'step': 1590, 'epoch': 2} {'type': 'loss', 'content': 0.025423089042305946, 'timestamp': '2025-09-04 03:59:27.233747', 'step': 1591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 03:59:27.314232', 'step': 1591, 'epoch': 2} {'type': 'loss', 'content': 0.07140816748142242, 'timestamp': '2025-09-04 03:59:27.328990', 'step': 1592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:27.426941', 'step': 1592, 'epoch': 2} {'type': 'loss', 'content': 0.06752584129571915, 'timestamp': '2025-09-04 03:59:27.447667', 'step': 1593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 03:59:27.519642', 'step': 1593, 'epoch': 2} {'type': 'loss', 'content': 0.06612498313188553, 'timestamp': '2025-09-04 03:59:27.532132', 'step': 1594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:27.640395', 'step': 1594, 'epoch': 2} {'type': 'loss', 'content': 0.011722175404429436, 'timestamp': '2025-09-04 03:59:27.660381', 'step': 1595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:27.759366', 'step': 1595, 'epoch': 2} {'type': 'loss', 'content': 0.008636832237243652, 'timestamp': '2025-09-04 03:59:27.778747', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:59:27.862555', 'step': 1596, 'epoch': 2} {'type': 'loss', 'content': 0.015985198318958282, 'timestamp': '2025-09-04 03:59:27.879266', 'step': 1597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:27.977074', 'step': 1597, 'epoch': 2} {'type': 'loss', 'content': 0.011449925601482391, 'timestamp': '2025-09-04 03:59:27.994541', 'step': 1598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:28.099296', 'step': 1598, 'epoch': 2} {'type': 'loss', 'content': 0.005783349275588989, 'timestamp': '2025-09-04 03:59:28.117980', 'step': 1599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 03:59:28.227054', 'step': 1599, 'epoch': 2} {'type': 'loss', 'content': 0.017474643886089325, 'timestamp': '2025-09-04 03:59:28.248142', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:36.640768', 'step': 1600, 'epoch': 2} {'type': 'pplx', 'content': 352.57666281747646, 'timestamp': '2025-09-04 03:59:36.643336', 'step': 1600, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1600', 'timestamp': '2025-09-04 03:59:37.160974', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:37.258635', 'step': 1600, 'epoch': 2} {'type': 'loss', 'content': 0.004377053584903479, 'timestamp': '2025-09-04 03:59:37.278760', 'step': 1601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:59:37.357328', 'step': 1601, 'epoch': 2} {'type': 'loss', 'content': 0.09427284449338913, 'timestamp': '2025-09-04 03:59:37.371013', 'step': 1602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:37.468118', 'step': 1602, 'epoch': 2} {'type': 'loss', 'content': 0.006842142436653376, 'timestamp': '2025-09-04 03:59:37.485384', 'step': 1603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:37.593709', 'step': 1603, 'epoch': 2} {'type': 'loss', 'content': 0.04602169245481491, 'timestamp': '2025-09-04 03:59:37.614356', 'step': 1604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:37.717365', 'step': 1604, 'epoch': 2} {'type': 'loss', 'content': 0.0032124435529112816, 'timestamp': '2025-09-04 03:59:37.738360', 'step': 1605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:37.850506', 'step': 1605, 'epoch': 2} {'type': 'loss', 'content': 0.0026126240845769644, 'timestamp': '2025-09-04 03:59:37.870407', 'step': 1606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:59:37.962373', 'step': 1606, 'epoch': 2} {'type': 'loss', 'content': 0.0179721862077713, 'timestamp': '2025-09-04 03:59:37.979015', 'step': 1607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:38.075975', 'step': 1607, 'epoch': 2} {'type': 'loss', 'content': 0.03628922253847122, 'timestamp': '2025-09-04 03:59:38.094017', 'step': 1608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:38.196918', 'step': 1608, 'epoch': 2} {'type': 'loss', 'content': 0.037857189774513245, 'timestamp': '2025-09-04 03:59:38.217694', 'step': 1609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:59:38.295044', 'step': 1609, 'epoch': 2} {'type': 'loss', 'content': 0.01975720003247261, 'timestamp': '2025-09-04 03:59:38.308225', 'step': 1610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:59:38.415102', 'step': 1610, 'epoch': 2} {'type': 'loss', 'content': 0.049535252153873444, 'timestamp': '2025-09-04 03:59:38.433713', 'step': 1611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:38.534698', 'step': 1611, 'epoch': 2} {'type': 'loss', 'content': 0.01265799067914486, 'timestamp': '2025-09-04 03:59:38.554118', 'step': 1612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:59:38.638435', 'step': 1612, 'epoch': 2} {'type': 'loss', 'content': 0.09150931984186172, 'timestamp': '2025-09-04 03:59:38.655552', 'step': 1613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:38.754553', 'step': 1613, 'epoch': 2} {'type': 'loss', 'content': 0.007259078789502382, 'timestamp': '2025-09-04 03:59:38.773143', 'step': 1614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 03:59:38.858195', 'step': 1614, 'epoch': 2} {'type': 'loss', 'content': 0.02660290338099003, 'timestamp': '2025-09-04 03:59:38.873585', 'step': 1615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 03:59:38.995688', 'step': 1615, 'epoch': 2} {'type': 'loss', 'content': 0.007390860002487898, 'timestamp': '2025-09-04 03:59:39.019381', 'step': 1616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:39.113144', 'step': 1616, 'epoch': 2} {'type': 'loss', 'content': 0.016929682344198227, 'timestamp': '2025-09-04 03:59:39.132006', 'step': 1617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:39.238588', 'step': 1617, 'epoch': 2} {'type': 'loss', 'content': 0.006743302568793297, 'timestamp': '2025-09-04 03:59:39.258325', 'step': 1618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:39.373616', 'step': 1618, 'epoch': 2} {'type': 'loss', 'content': 0.017765972763299942, 'timestamp': '2025-09-04 03:59:39.393491', 'step': 1619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:59:39.530286', 'step': 1619, 'epoch': 2} {'type': 'loss', 'content': 0.023081377148628235, 'timestamp': '2025-09-04 03:59:39.556794', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:47.940129', 'step': 1620, 'epoch': 2} {'type': 'pplx', 'content': 344.5752767610482, 'timestamp': '2025-09-04 03:59:47.944701', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 03:59:48.061987', 'step': 1620, 'epoch': 2} {'type': 'loss', 'content': 0.012214818969368935, 'timestamp': '2025-09-04 03:59:48.087346', 'step': 1621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:48.182192', 'step': 1621, 'epoch': 2} {'type': 'loss', 'content': 0.0025064749643206596, 'timestamp': '2025-09-04 03:59:48.199612', 'step': 1622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:59:48.334327', 'step': 1622, 'epoch': 2} {'type': 'loss', 'content': 0.013956296257674694, 'timestamp': '2025-09-04 03:59:48.360138', 'step': 1623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 03:59:48.459381', 'step': 1623, 'epoch': 2} {'type': 'loss', 'content': 0.02604658156633377, 'timestamp': '2025-09-04 03:59:48.478589', 'step': 1624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:59:48.592076', 'step': 1624, 'epoch': 2} {'type': 'loss', 'content': 0.015420682728290558, 'timestamp': '2025-09-04 03:59:48.614371', 'step': 1625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:59:48.717389', 'step': 1625, 'epoch': 2} {'type': 'loss', 'content': 0.02754286862909794, 'timestamp': '2025-09-04 03:59:48.736363', 'step': 1626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 03:59:48.812596', 'step': 1626, 'epoch': 2} {'type': 'loss', 'content': 0.0014424566179513931, 'timestamp': '2025-09-04 03:59:48.826120', 'step': 1627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:48.920932', 'step': 1627, 'epoch': 2} {'type': 'loss', 'content': 0.015860071405768394, 'timestamp': '2025-09-04 03:59:48.938622', 'step': 1628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 03:59:49.029533', 'step': 1628, 'epoch': 2} {'type': 'loss', 'content': 0.022340765222907066, 'timestamp': '2025-09-04 03:59:49.048431', 'step': 1629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 03:59:49.141575', 'step': 1629, 'epoch': 2} {'type': 'loss', 'content': 0.06079462915658951, 'timestamp': '2025-09-04 03:59:49.158388', 'step': 1630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:49.259259', 'step': 1630, 'epoch': 2} {'type': 'loss', 'content': 0.024208059534430504, 'timestamp': '2025-09-04 03:59:49.277857', 'step': 1631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:59:49.354899', 'step': 1631, 'epoch': 2} {'type': 'loss', 'content': 0.01721222698688507, 'timestamp': '2025-09-04 03:59:49.369440', 'step': 1632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 03:59:49.462980', 'step': 1632, 'epoch': 2} {'type': 'loss', 'content': 0.023982705548405647, 'timestamp': '2025-09-04 03:59:49.482064', 'step': 1633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:59:49.560407', 'step': 1633, 'epoch': 2} {'type': 'loss', 'content': 0.04129756614565849, 'timestamp': '2025-09-04 03:59:49.574243', 'step': 1634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:49.677225', 'step': 1634, 'epoch': 2} {'type': 'loss', 'content': 0.010030495934188366, 'timestamp': '2025-09-04 03:59:49.696269', 'step': 1635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 03:59:49.798566', 'step': 1635, 'epoch': 2} {'type': 'loss', 'content': 0.011695167981088161, 'timestamp': '2025-09-04 03:59:49.818213', 'step': 1636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 03:59:49.906625', 'step': 1636, 'epoch': 2} {'type': 'loss', 'content': 0.038352031260728836, 'timestamp': '2025-09-04 03:59:49.924748', 'step': 1637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 03:59:50.035679', 'step': 1637, 'epoch': 2} {'type': 'loss', 'content': 0.04370134696364403, 'timestamp': '2025-09-04 03:59:50.056082', 'step': 1638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:50.155714', 'step': 1638, 'epoch': 2} {'type': 'loss', 'content': 0.0329529233276844, 'timestamp': '2025-09-04 03:59:50.174460', 'step': 1639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 03:59:50.260151', 'step': 1639, 'epoch': 2} {'type': 'loss', 'content': 0.07019810378551483, 'timestamp': '2025-09-04 03:59:50.276294', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 03:59:58.677353', 'step': 1640, 'epoch': 2} {'type': 'pplx', 'content': 337.12616741483475, 'timestamp': '2025-09-04 03:59:58.679743', 'step': 1640, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1640', 'timestamp': '2025-09-04 03:59:59.042413', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 03:59:59.118171', 'step': 1640, 'epoch': 2} {'type': 'loss', 'content': 0.008293285965919495, 'timestamp': '2025-09-04 03:59:59.133120', 'step': 1641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 03:59:59.207261', 'step': 1641, 'epoch': 2} {'type': 'loss', 'content': 0.032241858541965485, 'timestamp': '2025-09-04 03:59:59.220518', 'step': 1642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 03:59:59.327128', 'step': 1642, 'epoch': 2} {'type': 'loss', 'content': 0.022532925009727478, 'timestamp': '2025-09-04 03:59:59.347081', 'step': 1643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 03:59:59.456331', 'step': 1643, 'epoch': 2} {'type': 'loss', 'content': 0.02694687992334366, 'timestamp': '2025-09-04 03:59:59.477507', 'step': 1644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 03:59:59.574944', 'step': 1644, 'epoch': 2} {'type': 'loss', 'content': 0.04047459363937378, 'timestamp': '2025-09-04 03:59:59.595569', 'step': 1645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 03:59:59.733413', 'step': 1645, 'epoch': 2} {'type': 'loss', 'content': 0.029626868665218353, 'timestamp': '2025-09-04 03:59:59.759206', 'step': 1646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 03:59:59.863484', 'step': 1646, 'epoch': 2} {'type': 'loss', 'content': 0.06880754232406616, 'timestamp': '2025-09-04 03:59:59.882549', 'step': 1647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 03:59:59.955783', 'step': 1647, 'epoch': 2} {'type': 'loss', 'content': 0.01954088918864727, 'timestamp': '2025-09-04 03:59:59.969438', 'step': 1648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:00.065814', 'step': 1648, 'epoch': 2} {'type': 'loss', 'content': 0.03680575639009476, 'timestamp': '2025-09-04 04:00:00.085936', 'step': 1649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:00.190966', 'step': 1649, 'epoch': 2} {'type': 'loss', 'content': 0.05359693989157677, 'timestamp': '2025-09-04 04:00:00.210087', 'step': 1650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:00:00.285825', 'step': 1650, 'epoch': 2} {'type': 'loss', 'content': 0.010023823007941246, 'timestamp': '2025-09-04 04:00:00.299422', 'step': 1651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:00:00.382070', 'step': 1651, 'epoch': 2} {'type': 'loss', 'content': 0.012647191993892193, 'timestamp': '2025-09-04 04:00:00.397724', 'step': 1652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:00.494834', 'step': 1652, 'epoch': 2} {'type': 'loss', 'content': 0.024116551503539085, 'timestamp': '2025-09-04 04:00:00.515369', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:00:00.602080', 'step': 1653, 'epoch': 2} {'type': 'loss', 'content': 0.012986271642148495, 'timestamp': '2025-09-04 04:00:00.617510', 'step': 1654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:00:00.704538', 'step': 1654, 'epoch': 2} {'type': 'loss', 'content': 0.0540911890566349, 'timestamp': '2025-09-04 04:00:00.719990', 'step': 1655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:00.820816', 'step': 1655, 'epoch': 2} {'type': 'loss', 'content': 0.007742506917566061, 'timestamp': '2025-09-04 04:00:00.840248', 'step': 1656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:00:00.959478', 'step': 1656, 'epoch': 2} {'type': 'loss', 'content': 0.010772481560707092, 'timestamp': '2025-09-04 04:00:00.984758', 'step': 1657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:00:01.065179', 'step': 1657, 'epoch': 2} {'type': 'loss', 'content': 0.02090446837246418, 'timestamp': '2025-09-04 04:00:01.079075', 'step': 1658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:00:01.186014', 'step': 1658, 'epoch': 2} {'type': 'loss', 'content': 0.001743293716572225, 'timestamp': '2025-09-04 04:00:01.205789', 'step': 1659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 04:00:01.334218', 'step': 1659, 'epoch': 2} {'type': 'loss', 'content': 0.010634549893438816, 'timestamp': '2025-09-04 04:00:01.359030', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:00:09.817753', 'step': 1660, 'epoch': 2} {'type': 'pplx', 'content': 331.6593289744211, 'timestamp': '2025-09-04 04:00:09.820816', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:00:09.894815', 'step': 1660, 'epoch': 2} {'type': 'loss', 'content': 0.02064441703259945, 'timestamp': '2025-09-04 04:00:09.909470', 'step': 1661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:00:10.012229', 'step': 1661, 'epoch': 2} {'type': 'loss', 'content': 0.015596513636410236, 'timestamp': '2025-09-04 04:00:10.031405', 'step': 1662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:10.132215', 'step': 1662, 'epoch': 2} {'type': 'loss', 'content': 0.011088766157627106, 'timestamp': '2025-09-04 04:00:10.150840', 'step': 1663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:00:10.273945', 'step': 1663, 'epoch': 2} {'type': 'loss', 'content': 0.012567834928631783, 'timestamp': '2025-09-04 04:00:10.295058', 'step': 1664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:00:10.403246', 'step': 1664, 'epoch': 2} {'type': 'loss', 'content': 0.06560499221086502, 'timestamp': '2025-09-04 04:00:10.425808', 'step': 1665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:10.519387', 'step': 1665, 'epoch': 2} {'type': 'loss', 'content': 0.012841533869504929, 'timestamp': '2025-09-04 04:00:10.536398', 'step': 1666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:00:10.650045', 'step': 1666, 'epoch': 2} {'type': 'loss', 'content': 0.003418769920244813, 'timestamp': '2025-09-04 04:00:10.670606', 'step': 1667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:10.766374', 'step': 1667, 'epoch': 2} {'type': 'loss', 'content': 0.02538418211042881, 'timestamp': '2025-09-04 04:00:10.784387', 'step': 1668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:00:10.884595', 'step': 1668, 'epoch': 2} {'type': 'loss', 'content': 0.08917974680662155, 'timestamp': '2025-09-04 04:00:10.905588', 'step': 1669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:11.005904', 'step': 1669, 'epoch': 2} {'type': 'loss', 'content': 0.052654486149549484, 'timestamp': '2025-09-04 04:00:11.024342', 'step': 1670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:11.123525', 'step': 1670, 'epoch': 2} {'type': 'loss', 'content': 0.0234109815210104, 'timestamp': '2025-09-04 04:00:11.141882', 'step': 1671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:00:11.229789', 'step': 1671, 'epoch': 2} {'type': 'loss', 'content': 0.03294301778078079, 'timestamp': '2025-09-04 04:00:11.245909', 'step': 1672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:00:11.315897', 'step': 1672, 'epoch': 2} {'type': 'loss', 'content': 0.01805739291012287, 'timestamp': '2025-09-04 04:00:11.329916', 'step': 1673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:11.434633', 'step': 1673, 'epoch': 2} {'type': 'loss', 'content': 0.01714208535850048, 'timestamp': '2025-09-04 04:00:11.453840', 'step': 1674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 04:00:11.580861', 'step': 1674, 'epoch': 2} {'type': 'loss', 'content': 0.04294885694980621, 'timestamp': '2025-09-04 04:00:11.605060', 'step': 1675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1040], 'flops': 20800126336064.0}, 'timestamp': '2025-09-04 04:00:11.755808', 'step': 1675, 'epoch': 2} {'type': 'loss', 'content': 0.04310276731848717, 'timestamp': '2025-09-04 04:00:11.785965', 'step': 1676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:00:11.886494', 'step': 1676, 'epoch': 2} {'type': 'loss', 'content': 0.03321857005357742, 'timestamp': '2025-09-04 04:00:11.907370', 'step': 1677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:00:11.986545', 'step': 1677, 'epoch': 2} {'type': 'loss', 'content': 0.014885162003338337, 'timestamp': '2025-09-04 04:00:12.000644', 'step': 1678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:12.102838', 'step': 1678, 'epoch': 2} {'type': 'loss', 'content': 0.00889945961534977, 'timestamp': '2025-09-04 04:00:12.121999', 'step': 1679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:12.226444', 'step': 1679, 'epoch': 2} {'type': 'loss', 'content': 0.0010784993646666408, 'timestamp': '2025-09-04 04:00:12.243714', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:00:20.689461', 'step': 1680, 'epoch': 2} {'type': 'pplx', 'content': 331.5873085585238, 'timestamp': '2025-09-04 04:00:20.691749', 'step': 1680, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1680', 'timestamp': '2025-09-04 04:00:21.056049', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:21.143665', 'step': 1680, 'epoch': 2} {'type': 'loss', 'content': 0.007037813309580088, 'timestamp': '2025-09-04 04:00:21.161779', 'step': 1681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:21.272043', 'step': 1681, 'epoch': 2} {'type': 'loss', 'content': 0.006806929595768452, 'timestamp': '2025-09-04 04:00:21.292471', 'step': 1682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:21.396953', 'step': 1682, 'epoch': 2} {'type': 'loss', 'content': 0.0040665543638169765, 'timestamp': '2025-09-04 04:00:21.416216', 'step': 1683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:21.506686', 'step': 1683, 'epoch': 2} {'type': 'loss', 'content': 0.009878264740109444, 'timestamp': '2025-09-04 04:00:21.523945', 'step': 1684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:21.615259', 'step': 1684, 'epoch': 2} {'type': 'loss', 'content': 0.016732510179281235, 'timestamp': '2025-09-04 04:00:21.634022', 'step': 1685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:00:21.744343', 'step': 1685, 'epoch': 2} {'type': 'loss', 'content': 0.0031748716719448566, 'timestamp': '2025-09-04 04:00:21.764249', 'step': 1686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:21.856813', 'step': 1686, 'epoch': 2} {'type': 'loss', 'content': 0.04727555438876152, 'timestamp': '2025-09-04 04:00:21.874028', 'step': 1687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:00:21.952413', 'step': 1687, 'epoch': 2} {'type': 'loss', 'content': 0.011287051253020763, 'timestamp': '2025-09-04 04:00:21.967393', 'step': 1688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:22.067839', 'step': 1688, 'epoch': 2} {'type': 'loss', 'content': 0.012352891266345978, 'timestamp': '2025-09-04 04:00:22.088808', 'step': 1689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:00:22.166671', 'step': 1689, 'epoch': 2} {'type': 'loss', 'content': 0.041657764464616776, 'timestamp': '2025-09-04 04:00:22.180689', 'step': 1690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:00:22.258826', 'step': 1690, 'epoch': 2} {'type': 'loss', 'content': 0.013096681796014309, 'timestamp': '2025-09-04 04:00:22.272579', 'step': 1691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:00:22.359084', 'step': 1691, 'epoch': 2} {'type': 'loss', 'content': 0.010199989192187786, 'timestamp': '2025-09-04 04:00:22.375432', 'step': 1692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:22.472488', 'step': 1692, 'epoch': 2} {'type': 'loss', 'content': 0.009935924783349037, 'timestamp': '2025-09-04 04:00:22.492845', 'step': 1693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:22.586149', 'step': 1693, 'epoch': 2} {'type': 'loss', 'content': 0.0010472126305103302, 'timestamp': '2025-09-04 04:00:22.603478', 'step': 1694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:22.707303', 'step': 1694, 'epoch': 2} {'type': 'loss', 'content': 0.013101182878017426, 'timestamp': '2025-09-04 04:00:22.726713', 'step': 1695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:00:22.838549', 'step': 1695, 'epoch': 2} {'type': 'loss', 'content': 0.00893877912312746, 'timestamp': '2025-09-04 04:00:22.859904', 'step': 1696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:22.957935', 'step': 1696, 'epoch': 2} {'type': 'loss', 'content': 0.012058167718350887, 'timestamp': '2025-09-04 04:00:22.978574', 'step': 1697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:23.069815', 'step': 1697, 'epoch': 2} {'type': 'loss', 'content': 0.03133353590965271, 'timestamp': '2025-09-04 04:00:23.086718', 'step': 1698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:23.190329', 'step': 1698, 'epoch': 2} {'type': 'loss', 'content': 0.004476282745599747, 'timestamp': '2025-09-04 04:00:23.209733', 'step': 1699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:23.309449', 'step': 1699, 'epoch': 2} {'type': 'loss', 'content': 0.025943979620933533, 'timestamp': '2025-09-04 04:00:23.328940', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:00:31.826010', 'step': 1700, 'epoch': 2} {'type': 'pplx', 'content': 333.02580516691586, 'timestamp': '2025-09-04 04:00:31.828237', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:00:31.926110', 'step': 1700, 'epoch': 2} {'type': 'loss', 'content': 0.017599618062376976, 'timestamp': '2025-09-04 04:00:31.947097', 'step': 1701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:32.040262', 'step': 1701, 'epoch': 2} {'type': 'loss', 'content': 0.0015532653778791428, 'timestamp': '2025-09-04 04:00:32.057510', 'step': 1702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:00:32.143074', 'step': 1702, 'epoch': 2} {'type': 'loss', 'content': 0.019918687641620636, 'timestamp': '2025-09-04 04:00:32.158506', 'step': 1703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:00:32.232768', 'step': 1703, 'epoch': 2} {'type': 'loss', 'content': 0.005274395924061537, 'timestamp': '2025-09-04 04:00:32.246605', 'step': 1704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:32.337510', 'step': 1704, 'epoch': 2} {'type': 'loss', 'content': 0.01871996931731701, 'timestamp': '2025-09-04 04:00:32.356341', 'step': 1705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:00:32.441434', 'step': 1705, 'epoch': 2} {'type': 'loss', 'content': 0.04257494583725929, 'timestamp': '2025-09-04 04:00:32.456936', 'step': 1706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:32.552180', 'step': 1706, 'epoch': 2} {'type': 'loss', 'content': 0.020532267168164253, 'timestamp': '2025-09-04 04:00:32.569560', 'step': 1707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:00:32.645022', 'step': 1707, 'epoch': 2} {'type': 'loss', 'content': 0.004226659890264273, 'timestamp': '2025-09-04 04:00:32.659438', 'step': 1708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:32.766042', 'step': 1708, 'epoch': 2} {'type': 'loss', 'content': 0.008803565986454487, 'timestamp': '2025-09-04 04:00:32.788565', 'step': 1709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:00:32.895625', 'step': 1709, 'epoch': 2} {'type': 'loss', 'content': 0.02005820721387863, 'timestamp': '2025-09-04 04:00:32.915733', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:33.006906', 'step': 1710, 'epoch': 2} {'type': 'loss', 'content': 0.1214195266366005, 'timestamp': '2025-09-04 04:00:33.023783', 'step': 1711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:33.126759', 'step': 1711, 'epoch': 2} {'type': 'loss', 'content': 0.045690055936574936, 'timestamp': '2025-09-04 04:00:33.144887', 'step': 1712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:00:33.259834', 'step': 1712, 'epoch': 2} {'type': 'loss', 'content': 0.0035067156422883272, 'timestamp': '2025-09-04 04:00:33.284236', 'step': 1713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:33.378429', 'step': 1713, 'epoch': 2} {'type': 'loss', 'content': 0.012944525107741356, 'timestamp': '2025-09-04 04:00:33.395663', 'step': 1714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:33.501202', 'step': 1714, 'epoch': 2} {'type': 'loss', 'content': 0.0048626684583723545, 'timestamp': '2025-09-04 04:00:33.520594', 'step': 1715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:00:33.598180', 'step': 1715, 'epoch': 2} {'type': 'loss', 'content': 0.01824316196143627, 'timestamp': '2025-09-04 04:00:33.613069', 'step': 1716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:33.711266', 'step': 1716, 'epoch': 2} {'type': 'loss', 'content': 0.008359821513295174, 'timestamp': '2025-09-04 04:00:33.732074', 'step': 1717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:00:33.810997', 'step': 1717, 'epoch': 2} {'type': 'loss', 'content': 0.056332752108573914, 'timestamp': '2025-09-04 04:00:33.824856', 'step': 1718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:00:33.959830', 'step': 1718, 'epoch': 2} {'type': 'loss', 'content': 0.0033087804913520813, 'timestamp': '2025-09-04 04:00:33.985444', 'step': 1719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:34.087256', 'step': 1719, 'epoch': 2} {'type': 'loss', 'content': 0.0023557273671031, 'timestamp': '2025-09-04 04:00:34.106749', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:00:42.599713', 'step': 1720, 'epoch': 2} {'type': 'pplx', 'content': 333.25088739723225, 'timestamp': '2025-09-04 04:00:42.601861', 'step': 1720, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1720', 'timestamp': '2025-09-04 04:00:42.959913', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:43.064965', 'step': 1720, 'epoch': 2} {'type': 'loss', 'content': 0.03474944829940796, 'timestamp': '2025-09-04 04:00:43.087465', 'step': 1721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:43.198541', 'step': 1721, 'epoch': 2} {'type': 'loss', 'content': 0.005826589651405811, 'timestamp': '2025-09-04 04:00:43.218959', 'step': 1722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:00:43.339501', 'step': 1722, 'epoch': 2} {'type': 'loss', 'content': 0.0146811967715621, 'timestamp': '2025-09-04 04:00:43.361337', 'step': 1723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:00:43.445239', 'step': 1723, 'epoch': 2} {'type': 'loss', 'content': 0.027011031284928322, 'timestamp': '2025-09-04 04:00:43.461157', 'step': 1724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:43.555904', 'step': 1724, 'epoch': 2} {'type': 'loss', 'content': 0.009984579868614674, 'timestamp': '2025-09-04 04:00:43.574597', 'step': 1725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:43.674494', 'step': 1725, 'epoch': 2} {'type': 'loss', 'content': 0.03857394680380821, 'timestamp': '2025-09-04 04:00:43.693031', 'step': 1726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:43.786677', 'step': 1726, 'epoch': 2} {'type': 'loss', 'content': 0.0024763226974755526, 'timestamp': '2025-09-04 04:00:43.803892', 'step': 1727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:00:43.913846', 'step': 1727, 'epoch': 2} {'type': 'loss', 'content': 0.02332633174955845, 'timestamp': '2025-09-04 04:00:43.935398', 'step': 1728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:44.036422', 'step': 1728, 'epoch': 2} {'type': 'loss', 'content': 0.06381595879793167, 'timestamp': '2025-09-04 04:00:44.057407', 'step': 1729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:44.153790', 'step': 1729, 'epoch': 2} {'type': 'loss', 'content': 0.0381832979619503, 'timestamp': '2025-09-04 04:00:44.171248', 'step': 1730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:44.265710', 'step': 1730, 'epoch': 2} {'type': 'loss', 'content': 0.027034031227231026, 'timestamp': '2025-09-04 04:00:44.283200', 'step': 1731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:44.378747', 'step': 1731, 'epoch': 2} {'type': 'loss', 'content': 0.022275667637586594, 'timestamp': '2025-09-04 04:00:44.397024', 'step': 1732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:00:44.502743', 'step': 1732, 'epoch': 2} {'type': 'loss', 'content': 0.11091621220111847, 'timestamp': '2025-09-04 04:00:44.525023', 'step': 1733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:44.636832', 'step': 1733, 'epoch': 2} {'type': 'loss', 'content': 0.0019922046922147274, 'timestamp': '2025-09-04 04:00:44.657096', 'step': 1734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:44.757668', 'step': 1734, 'epoch': 2} {'type': 'loss', 'content': 0.001954772276803851, 'timestamp': '2025-09-04 04:00:44.776302', 'step': 1735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:44.879280', 'step': 1735, 'epoch': 2} {'type': 'loss', 'content': 0.0044233831577003, 'timestamp': '2025-09-04 04:00:44.898969', 'step': 1736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:00:44.981562', 'step': 1736, 'epoch': 2} {'type': 'loss', 'content': 0.0020844575483351946, 'timestamp': '2025-09-04 04:00:44.998177', 'step': 1737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:45.088833', 'step': 1737, 'epoch': 2} {'type': 'loss', 'content': 0.021219972521066666, 'timestamp': '2025-09-04 04:00:45.105675', 'step': 1738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:45.200766', 'step': 1738, 'epoch': 2} {'type': 'loss', 'content': 0.0035940525121986866, 'timestamp': '2025-09-04 04:00:45.218204', 'step': 1739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:45.322209', 'step': 1739, 'epoch': 2} {'type': 'loss', 'content': 0.004270992241799831, 'timestamp': '2025-09-04 04:00:45.342196', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:00:53.844925', 'step': 1740, 'epoch': 2} {'type': 'pplx', 'content': 332.1656256388727, 'timestamp': '2025-09-04 04:00:53.848701', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:53.945874', 'step': 1740, 'epoch': 2} {'type': 'loss', 'content': 0.011263997294008732, 'timestamp': '2025-09-04 04:00:53.966713', 'step': 1741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 04:00:54.186951', 'step': 1741, 'epoch': 2} {'type': 'loss', 'content': 0.0082445302978158, 'timestamp': '2025-09-04 04:00:54.229291', 'step': 1742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:54.333877', 'step': 1742, 'epoch': 2} {'type': 'loss', 'content': 0.015566142275929451, 'timestamp': '2025-09-04 04:00:54.353256', 'step': 1743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:54.446192', 'step': 1743, 'epoch': 2} {'type': 'loss', 'content': 0.007874183356761932, 'timestamp': '2025-09-04 04:00:54.464227', 'step': 1744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:00:54.538554', 'step': 1744, 'epoch': 2} {'type': 'loss', 'content': 0.03222401440143585, 'timestamp': '2025-09-04 04:00:54.553386', 'step': 1745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:54.665546', 'step': 1745, 'epoch': 2} {'type': 'loss', 'content': 0.005354198161512613, 'timestamp': '2025-09-04 04:00:54.686098', 'step': 1746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:00:54.773362', 'step': 1746, 'epoch': 2} {'type': 'loss', 'content': 0.02252938598394394, 'timestamp': '2025-09-04 04:00:54.789032', 'step': 1747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:54.880684', 'step': 1747, 'epoch': 2} {'type': 'loss', 'content': 0.016836240887641907, 'timestamp': '2025-09-04 04:00:54.898361', 'step': 1748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:00:55.003472', 'step': 1748, 'epoch': 2} {'type': 'loss', 'content': 0.060379642993211746, 'timestamp': '2025-09-04 04:00:55.025325', 'step': 1749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:00:55.127851', 'step': 1749, 'epoch': 2} {'type': 'loss', 'content': 0.00034839441650547087, 'timestamp': '2025-09-04 04:00:55.147125', 'step': 1750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:00:55.241660', 'step': 1750, 'epoch': 2} {'type': 'loss', 'content': 0.006921238731592894, 'timestamp': '2025-09-04 04:00:55.259134', 'step': 1751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:00:55.355817', 'step': 1751, 'epoch': 2} {'type': 'loss', 'content': 0.0024503259919583797, 'timestamp': '2025-09-04 04:00:55.373415', 'step': 1752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:00:55.474861', 'step': 1752, 'epoch': 2} {'type': 'loss', 'content': 0.013867728412151337, 'timestamp': '2025-09-04 04:00:55.495862', 'step': 1753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:55.593558', 'step': 1753, 'epoch': 2} {'type': 'loss', 'content': 0.012754159979522228, 'timestamp': '2025-09-04 04:00:55.611128', 'step': 1754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:00:55.720755', 'step': 1754, 'epoch': 2} {'type': 'loss', 'content': 0.016128217801451683, 'timestamp': '2025-09-04 04:00:55.741196', 'step': 1755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:00:55.832248', 'step': 1755, 'epoch': 2} {'type': 'loss', 'content': 0.002366261789575219, 'timestamp': '2025-09-04 04:00:55.849776', 'step': 1756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:00:55.931287', 'step': 1756, 'epoch': 2} {'type': 'loss', 'content': 0.012999413534998894, 'timestamp': '2025-09-04 04:00:55.947967', 'step': 1757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:00:56.042427', 'step': 1757, 'epoch': 2} {'type': 'loss', 'content': 0.011377224698662758, 'timestamp': '2025-09-04 04:00:56.059803', 'step': 1758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:00:56.160358', 'step': 1758, 'epoch': 2} {'type': 'loss', 'content': 0.028042022138834, 'timestamp': '2025-09-04 04:00:56.179270', 'step': 1759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:00:56.280616', 'step': 1759, 'epoch': 2} {'type': 'loss', 'content': 0.07089690864086151, 'timestamp': '2025-09-04 04:00:56.299883', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:01:04.797965', 'step': 1760, 'epoch': 2} {'type': 'pplx', 'content': 331.5460997167647, 'timestamp': '2025-09-04 04:01:04.799966', 'step': 1760, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1760', 'timestamp': '2025-09-04 04:01:05.165288', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:01:05.245129', 'step': 1760, 'epoch': 2} {'type': 'loss', 'content': 0.005329194013029337, 'timestamp': '2025-09-04 04:01:05.261502', 'step': 1761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:05.367704', 'step': 1761, 'epoch': 2} {'type': 'loss', 'content': 0.0019212173065170646, 'timestamp': '2025-09-04 04:01:05.387739', 'step': 1762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:05.482905', 'step': 1762, 'epoch': 2} {'type': 'loss', 'content': 0.011046413332223892, 'timestamp': '2025-09-04 04:01:05.500152', 'step': 1763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:01:05.603684', 'step': 1763, 'epoch': 2} {'type': 'loss', 'content': 0.02376765012741089, 'timestamp': '2025-09-04 04:01:05.623816', 'step': 1764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:05.765786', 'step': 1764, 'epoch': 2} {'type': 'loss', 'content': 0.02195976860821247, 'timestamp': '2025-09-04 04:01:05.786601', 'step': 1765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:01:05.872683', 'step': 1765, 'epoch': 2} {'type': 'loss', 'content': 0.06642767786979675, 'timestamp': '2025-09-04 04:01:05.887982', 'step': 1766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:01:06.093441', 'step': 1766, 'epoch': 2} {'type': 'loss', 'content': 0.0038195352535694838, 'timestamp': '2025-09-04 04:01:06.132739', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:01:06.245766', 'step': 1767, 'epoch': 2} {'type': 'loss', 'content': 0.026749003678560257, 'timestamp': '2025-09-04 04:01:06.267150', 'step': 1768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:01:06.342459', 'step': 1768, 'epoch': 2} {'type': 'loss', 'content': 0.032609354704618454, 'timestamp': '2025-09-04 04:01:06.357861', 'step': 1769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:01:06.472022', 'step': 1769, 'epoch': 2} {'type': 'loss', 'content': 0.02017885632812977, 'timestamp': '2025-09-04 04:01:06.492669', 'step': 1770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:01:06.570069', 'step': 1770, 'epoch': 2} {'type': 'loss', 'content': 0.01600290834903717, 'timestamp': '2025-09-04 04:01:06.583952', 'step': 1771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:01:06.656885', 'step': 1771, 'epoch': 2} {'type': 'loss', 'content': 0.0048013306222856045, 'timestamp': '2025-09-04 04:01:06.670727', 'step': 1772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:01:06.792023', 'step': 1772, 'epoch': 2} {'type': 'loss', 'content': 0.011499549262225628, 'timestamp': '2025-09-04 04:01:06.817313', 'step': 1773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:01:06.894357', 'step': 1773, 'epoch': 2} {'type': 'loss', 'content': 0.007216259371489286, 'timestamp': '2025-09-04 04:01:06.908347', 'step': 1774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 04:01:07.041806', 'step': 1774, 'epoch': 2} {'type': 'loss', 'content': 0.040591221302747726, 'timestamp': '2025-09-04 04:01:07.066436', 'step': 1775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:07.165680', 'step': 1775, 'epoch': 2} {'type': 'loss', 'content': 0.030231038108468056, 'timestamp': '2025-09-04 04:01:07.185187', 'step': 1776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:07.278526', 'step': 1776, 'epoch': 2} {'type': 'loss', 'content': 0.04355088993906975, 'timestamp': '2025-09-04 04:01:07.297576', 'step': 1777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:07.390198', 'step': 1777, 'epoch': 2} {'type': 'loss', 'content': 0.010329993441700935, 'timestamp': '2025-09-04 04:01:07.407455', 'step': 1778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:01:07.494403', 'step': 1778, 'epoch': 2} {'type': 'loss', 'content': 0.06678889691829681, 'timestamp': '2025-09-04 04:01:07.510066', 'step': 1779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:07.603681', 'step': 1779, 'epoch': 2} {'type': 'loss', 'content': 0.03957043215632439, 'timestamp': '2025-09-04 04:01:07.621544', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:01:16.109115', 'step': 1780, 'epoch': 2} {'type': 'pplx', 'content': 332.86792094420156, 'timestamp': '2025-09-04 04:01:16.111280', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:01:16.214758', 'step': 1780, 'epoch': 2} {'type': 'loss', 'content': 0.02120288833975792, 'timestamp': '2025-09-04 04:01:16.237163', 'step': 1781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:16.331514', 'step': 1781, 'epoch': 2} {'type': 'loss', 'content': 0.07391846925020218, 'timestamp': '2025-09-04 04:01:16.348990', 'step': 1782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:01:16.454583', 'step': 1782, 'epoch': 2} {'type': 'loss', 'content': 0.003579053794965148, 'timestamp': '2025-09-04 04:01:16.473828', 'step': 1783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:01:16.583998', 'step': 1783, 'epoch': 2} {'type': 'loss', 'content': 0.00399737898260355, 'timestamp': '2025-09-04 04:01:16.604981', 'step': 1784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:01:16.709762', 'step': 1784, 'epoch': 2} {'type': 'loss', 'content': 0.06767401099205017, 'timestamp': '2025-09-04 04:01:16.732053', 'step': 1785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:16.839280', 'step': 1785, 'epoch': 2} {'type': 'loss', 'content': 0.015294116921722889, 'timestamp': '2025-09-04 04:01:16.859246', 'step': 1786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:16.966615', 'step': 1786, 'epoch': 2} {'type': 'loss', 'content': 0.056523002684116364, 'timestamp': '2025-09-04 04:01:16.986545', 'step': 1787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:17.093325', 'step': 1787, 'epoch': 2} {'type': 'loss', 'content': 0.031245263293385506, 'timestamp': '2025-09-04 04:01:17.114048', 'step': 1788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:01:17.189344', 'step': 1788, 'epoch': 2} {'type': 'loss', 'content': 0.013243497349321842, 'timestamp': '2025-09-04 04:01:17.204416', 'step': 1789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:01:17.286843', 'step': 1789, 'epoch': 2} {'type': 'loss', 'content': 0.008389891125261784, 'timestamp': '2025-09-04 04:01:17.302064', 'step': 1790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:01:17.396893', 'step': 1790, 'epoch': 2} {'type': 'loss', 'content': 0.07410303503274918, 'timestamp': '2025-09-04 04:01:17.414517', 'step': 1791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:01:17.492341', 'step': 1791, 'epoch': 2} {'type': 'loss', 'content': 0.04840013012290001, 'timestamp': '2025-09-04 04:01:17.507244', 'step': 1792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:01:17.596637', 'step': 1792, 'epoch': 2} {'type': 'loss', 'content': 0.033458348363637924, 'timestamp': '2025-09-04 04:01:17.615173', 'step': 1793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:01:17.700214', 'step': 1793, 'epoch': 2} {'type': 'loss', 'content': 0.0031342965085059404, 'timestamp': '2025-09-04 04:01:17.715288', 'step': 1794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:01:17.823263', 'step': 1794, 'epoch': 2} {'type': 'loss', 'content': 0.004811432678252459, 'timestamp': '2025-09-04 04:01:17.843383', 'step': 1795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:17.943652', 'step': 1795, 'epoch': 2} {'type': 'loss', 'content': 0.060861192643642426, 'timestamp': '2025-09-04 04:01:17.963035', 'step': 1796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:01:18.038096', 'step': 1796, 'epoch': 2} {'type': 'loss', 'content': 0.0050119198858737946, 'timestamp': '2025-09-04 04:01:18.053478', 'step': 1797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:01:18.130769', 'step': 1797, 'epoch': 2} {'type': 'loss', 'content': 0.03676461800932884, 'timestamp': '2025-09-04 04:01:18.144495', 'step': 1798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:01:18.220106', 'step': 1798, 'epoch': 2} {'type': 'loss', 'content': 0.014785559847950935, 'timestamp': '2025-09-04 04:01:18.233902', 'step': 1799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:01:18.317658', 'step': 1799, 'epoch': 2} {'type': 'loss', 'content': 0.008279402740299702, 'timestamp': '2025-09-04 04:01:18.333743', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:01:26.804956', 'step': 1800, 'epoch': 2} {'type': 'pplx', 'content': 334.6393289394288, 'timestamp': '2025-09-04 04:01:26.807267', 'step': 1800, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1800', 'timestamp': '2025-09-04 04:01:27.163015', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:01:27.279145', 'step': 1800, 'epoch': 2} {'type': 'loss', 'content': 0.031706344336271286, 'timestamp': '2025-09-04 04:01:27.302897', 'step': 1801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:01:27.406870', 'step': 1801, 'epoch': 2} {'type': 'loss', 'content': 0.010657178238034248, 'timestamp': '2025-09-04 04:01:27.426238', 'step': 1802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:27.524874', 'step': 1802, 'epoch': 2} {'type': 'loss', 'content': 0.006731742527335882, 'timestamp': '2025-09-04 04:01:27.543242', 'step': 1803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:01:27.654231', 'step': 1803, 'epoch': 2} {'type': 'loss', 'content': 0.010640786960721016, 'timestamp': '2025-09-04 04:01:27.675378', 'step': 1804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:27.767989', 'step': 1804, 'epoch': 2} {'type': 'loss', 'content': 0.0017573751974850893, 'timestamp': '2025-09-04 04:01:27.786871', 'step': 1805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:01:27.861394', 'step': 1805, 'epoch': 2} {'type': 'loss', 'content': 0.0030143826734274626, 'timestamp': '2025-09-04 04:01:27.874925', 'step': 1806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:01:27.988082', 'step': 1806, 'epoch': 2} {'type': 'loss', 'content': 0.05524804815649986, 'timestamp': '2025-09-04 04:01:28.008638', 'step': 1807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:01:28.111875', 'step': 1807, 'epoch': 2} {'type': 'loss', 'content': 0.0070555852726101875, 'timestamp': '2025-09-04 04:01:28.131855', 'step': 1808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:28.237339', 'step': 1808, 'epoch': 2} {'type': 'loss', 'content': 0.0052083320915699005, 'timestamp': '2025-09-04 04:01:28.259221', 'step': 1809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:01:28.335457', 'step': 1809, 'epoch': 2} {'type': 'loss', 'content': 0.006322692148387432, 'timestamp': '2025-09-04 04:01:28.348911', 'step': 1810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:28.444431', 'step': 1810, 'epoch': 2} {'type': 'loss', 'content': 0.0009704896947368979, 'timestamp': '2025-09-04 04:01:28.461713', 'step': 1811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:28.563516', 'step': 1811, 'epoch': 2} {'type': 'loss', 'content': 0.032470621168613434, 'timestamp': '2025-09-04 04:01:28.582668', 'step': 1812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:28.690265', 'step': 1812, 'epoch': 2} {'type': 'loss', 'content': 0.09668878465890884, 'timestamp': '2025-09-04 04:01:28.711902', 'step': 1813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:01:28.825233', 'step': 1813, 'epoch': 2} {'type': 'loss', 'content': 0.043325960636138916, 'timestamp': '2025-09-04 04:01:28.845811', 'step': 1814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:01:28.919550', 'step': 1814, 'epoch': 2} {'type': 'loss', 'content': 0.03765127435326576, 'timestamp': '2025-09-04 04:01:28.932457', 'step': 1815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:29.027274', 'step': 1815, 'epoch': 2} {'type': 'loss', 'content': 0.0037220031954348087, 'timestamp': '2025-09-04 04:01:29.045400', 'step': 1816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:29.138382', 'step': 1816, 'epoch': 2} {'type': 'loss', 'content': 0.010741024278104305, 'timestamp': '2025-09-04 04:01:29.157429', 'step': 1817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:01:29.260861', 'step': 1817, 'epoch': 2} {'type': 'loss', 'content': 0.008968895301222801, 'timestamp': '2025-09-04 04:01:29.279763', 'step': 1818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:29.373806', 'step': 1818, 'epoch': 2} {'type': 'loss', 'content': 0.044184453785419464, 'timestamp': '2025-09-04 04:01:29.390644', 'step': 1819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:01:29.487551', 'step': 1819, 'epoch': 2} {'type': 'loss', 'content': 0.000747227284591645, 'timestamp': '2025-09-04 04:01:29.505764', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:01:37.982577', 'step': 1820, 'epoch': 2} {'type': 'pplx', 'content': 334.53241911357134, 'timestamp': '2025-09-04 04:01:37.984505', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:01:38.089414', 'step': 1820, 'epoch': 2} {'type': 'loss', 'content': 0.0014711732510477304, 'timestamp': '2025-09-04 04:01:38.112040', 'step': 1821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:38.205609', 'step': 1821, 'epoch': 2} {'type': 'loss', 'content': 0.010439584963023663, 'timestamp': '2025-09-04 04:01:38.222944', 'step': 1822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:38.323038', 'step': 1822, 'epoch': 2} {'type': 'loss', 'content': 0.012518877163529396, 'timestamp': '2025-09-04 04:01:38.341806', 'step': 1823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:01:38.444098', 'step': 1823, 'epoch': 2} {'type': 'loss', 'content': 0.04167770966887474, 'timestamp': '2025-09-04 04:01:38.463964', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:38.555702', 'step': 1824, 'epoch': 2} {'type': 'loss', 'content': 0.005846615415066481, 'timestamp': '2025-09-04 04:01:38.574612', 'step': 1825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:38.675901', 'step': 1825, 'epoch': 2} {'type': 'loss', 'content': 0.026406852528452873, 'timestamp': '2025-09-04 04:01:38.694683', 'step': 1826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 04:01:38.758694', 'step': 1826, 'epoch': 2} {'type': 'loss', 'content': 0.015116888098418713, 'timestamp': '2025-09-04 04:01:38.769907', 'step': 1827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:01:38.887154', 'step': 1827, 'epoch': 2} {'type': 'loss', 'content': 0.09362763166427612, 'timestamp': '2025-09-04 04:01:38.909974', 'step': 1828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:01:38.999028', 'step': 1828, 'epoch': 2} {'type': 'loss', 'content': 0.000753458240069449, 'timestamp': '2025-09-04 04:01:39.017403', 'step': 1829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:01:39.097095', 'step': 1829, 'epoch': 2} {'type': 'loss', 'content': 0.021402668207883835, 'timestamp': '2025-09-04 04:01:39.111346', 'step': 1830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:39.213078', 'step': 1830, 'epoch': 2} {'type': 'loss', 'content': 0.005958658177405596, 'timestamp': '2025-09-04 04:01:39.231929', 'step': 1831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:39.333890', 'step': 1831, 'epoch': 2} {'type': 'loss', 'content': 0.051472008228302, 'timestamp': '2025-09-04 04:01:39.353219', 'step': 1832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:01:39.437888', 'step': 1832, 'epoch': 2} {'type': 'loss', 'content': 0.030274610966444016, 'timestamp': '2025-09-04 04:01:39.454964', 'step': 1833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:01:39.562661', 'step': 1833, 'epoch': 2} {'type': 'loss', 'content': 0.007288443390280008, 'timestamp': '2025-09-04 04:01:39.582323', 'step': 1834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:39.676731', 'step': 1834, 'epoch': 2} {'type': 'loss', 'content': 0.006486637983471155, 'timestamp': '2025-09-04 04:01:39.693979', 'step': 1835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:39.793975', 'step': 1835, 'epoch': 2} {'type': 'loss', 'content': 0.0005186740309000015, 'timestamp': '2025-09-04 04:01:39.813264', 'step': 1836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:39.904684', 'step': 1836, 'epoch': 2} {'type': 'loss', 'content': 0.004083861596882343, 'timestamp': '2025-09-04 04:01:39.923216', 'step': 1837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:01:40.025797', 'step': 1837, 'epoch': 2} {'type': 'loss', 'content': 0.018782733008265495, 'timestamp': '2025-09-04 04:01:40.045011', 'step': 1838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:01:40.121633', 'step': 1838, 'epoch': 2} {'type': 'loss', 'content': 0.004725904669612646, 'timestamp': '2025-09-04 04:01:40.135403', 'step': 1839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:01:40.226335', 'step': 1839, 'epoch': 2} {'type': 'loss', 'content': 0.027214346453547478, 'timestamp': '2025-09-04 04:01:40.243999', 'step': 1840, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:01:48.732766', 'step': 1840, 'epoch': 2} {'type': 'pplx', 'content': 332.6223443151369, 'timestamp': '2025-09-04 04:01:48.735049', 'step': 1840, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1840', 'timestamp': '2025-09-04 04:01:49.155872', 'step': 1840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:01:49.243365', 'step': 1840, 'epoch': 2} {'type': 'loss', 'content': 0.00866016000509262, 'timestamp': '2025-09-04 04:01:49.260285', 'step': 1841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:01:49.383488', 'step': 1841, 'epoch': 2} {'type': 'loss', 'content': 0.02382933534681797, 'timestamp': '2025-09-04 04:01:49.406702', 'step': 1842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:49.501505', 'step': 1842, 'epoch': 2} {'type': 'loss', 'content': 0.018402768298983574, 'timestamp': '2025-09-04 04:01:49.518813', 'step': 1843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:01:49.615531', 'step': 1843, 'epoch': 2} {'type': 'loss', 'content': 0.0033138843718916178, 'timestamp': '2025-09-04 04:01:49.633725', 'step': 1844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:01:49.765165', 'step': 1844, 'epoch': 2} {'type': 'loss', 'content': 0.014491533860564232, 'timestamp': '2025-09-04 04:01:49.793552', 'step': 1845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:49.893752', 'step': 1845, 'epoch': 2} {'type': 'loss', 'content': 0.06931175291538239, 'timestamp': '2025-09-04 04:01:49.912401', 'step': 1846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:50.006776', 'step': 1846, 'epoch': 2} {'type': 'loss', 'content': 0.04427943378686905, 'timestamp': '2025-09-04 04:01:50.023915', 'step': 1847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:01:50.132323', 'step': 1847, 'epoch': 2} {'type': 'loss', 'content': 0.012512077577412128, 'timestamp': '2025-09-04 04:01:50.153223', 'step': 1848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:50.245240', 'step': 1848, 'epoch': 2} {'type': 'loss', 'content': 0.009834478609263897, 'timestamp': '2025-09-04 04:01:50.263906', 'step': 1849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:01:50.369649', 'step': 1849, 'epoch': 2} {'type': 'loss', 'content': 0.07198784500360489, 'timestamp': '2025-09-04 04:01:50.388693', 'step': 1850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:01:50.499515', 'step': 1850, 'epoch': 2} {'type': 'loss', 'content': 0.031000154092907906, 'timestamp': '2025-09-04 04:01:50.519893', 'step': 1851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:01:50.612703', 'step': 1851, 'epoch': 2} {'type': 'loss', 'content': 0.01771543361246586, 'timestamp': '2025-09-04 04:01:50.630038', 'step': 1852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:01:50.751707', 'step': 1852, 'epoch': 2} {'type': 'loss', 'content': 0.009000863879919052, 'timestamp': '2025-09-04 04:01:50.777057', 'step': 1853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:50.871081', 'step': 1853, 'epoch': 2} {'type': 'loss', 'content': 0.017614608630537987, 'timestamp': '2025-09-04 04:01:50.888216', 'step': 1854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:01:50.989658', 'step': 1854, 'epoch': 2} {'type': 'loss', 'content': 0.0047741541638970375, 'timestamp': '2025-09-04 04:01:51.008530', 'step': 1855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:01:51.099179', 'step': 1855, 'epoch': 2} {'type': 'loss', 'content': 0.029526762664318085, 'timestamp': '2025-09-04 04:01:51.116803', 'step': 1856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:01:51.208159', 'step': 1856, 'epoch': 2} {'type': 'loss', 'content': 0.01259857602417469, 'timestamp': '2025-09-04 04:01:51.227097', 'step': 1857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:01:51.305343', 'step': 1857, 'epoch': 2} {'type': 'loss', 'content': 0.02670917473733425, 'timestamp': '2025-09-04 04:01:51.318941', 'step': 1858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:01:51.419359', 'step': 1858, 'epoch': 2} {'type': 'loss', 'content': 0.0215139277279377, 'timestamp': '2025-09-04 04:01:51.437922', 'step': 1859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:01:51.523670', 'step': 1859, 'epoch': 2} {'type': 'loss', 'content': 0.013091914355754852, 'timestamp': '2025-09-04 04:01:51.540128', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:00.150071', 'step': 1860, 'epoch': 2} {'type': 'pplx', 'content': 328.99443454458276, 'timestamp': '2025-09-04 04:02:00.152296', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:02:00.234249', 'step': 1860, 'epoch': 2} {'type': 'loss', 'content': 0.021875588223338127, 'timestamp': '2025-09-04 04:02:00.250938', 'step': 1861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:00.341251', 'step': 1861, 'epoch': 2} {'type': 'loss', 'content': 0.0015288630966097116, 'timestamp': '2025-09-04 04:02:00.358039', 'step': 1862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:00.458610', 'step': 1862, 'epoch': 2} {'type': 'loss', 'content': 0.03855356201529503, 'timestamp': '2025-09-04 04:02:00.477318', 'step': 1863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:02:00.588810', 'step': 1863, 'epoch': 2} {'type': 'loss', 'content': 0.015515622682869434, 'timestamp': '2025-09-04 04:02:00.610288', 'step': 1864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:00.699121', 'step': 1864, 'epoch': 2} {'type': 'loss', 'content': 0.020159801468253136, 'timestamp': '2025-09-04 04:02:00.717609', 'step': 1865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:02:00.826565', 'step': 1865, 'epoch': 2} {'type': 'loss', 'content': 0.0021761921234428883, 'timestamp': '2025-09-04 04:02:00.846815', 'step': 1866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:00.957794', 'step': 1866, 'epoch': 2} {'type': 'loss', 'content': 0.022660445421934128, 'timestamp': '2025-09-04 04:02:00.978402', 'step': 1867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:02:01.062716', 'step': 1867, 'epoch': 2} {'type': 'loss', 'content': 0.004487188998609781, 'timestamp': '2025-09-04 04:02:01.078811', 'step': 1868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:01.169919', 'step': 1868, 'epoch': 2} {'type': 'loss', 'content': 0.034519318491220474, 'timestamp': '2025-09-04 04:02:01.189116', 'step': 1869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:01.300376', 'step': 1869, 'epoch': 2} {'type': 'loss', 'content': 0.0377497524023056, 'timestamp': '2025-09-04 04:02:01.320997', 'step': 1870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:01.415723', 'step': 1870, 'epoch': 2} {'type': 'loss', 'content': 0.015238355845212936, 'timestamp': '2025-09-04 04:02:01.433073', 'step': 1871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:01.535777', 'step': 1871, 'epoch': 2} {'type': 'loss', 'content': 0.012231401167809963, 'timestamp': '2025-09-04 04:02:01.555722', 'step': 1872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:02:01.663789', 'step': 1872, 'epoch': 2} {'type': 'loss', 'content': 0.003201687941327691, 'timestamp': '2025-09-04 04:02:01.686618', 'step': 1873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:01.791786', 'step': 1873, 'epoch': 2} {'type': 'loss', 'content': 0.024663355201482773, 'timestamp': '2025-09-04 04:02:01.811100', 'step': 1874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:02:01.904041', 'step': 1874, 'epoch': 2} {'type': 'loss', 'content': 0.07429840415716171, 'timestamp': '2025-09-04 04:02:01.921296', 'step': 1875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:02.021345', 'step': 1875, 'epoch': 2} {'type': 'loss', 'content': 0.0015133678680285811, 'timestamp': '2025-09-04 04:02:02.040982', 'step': 1876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:02.144440', 'step': 1876, 'epoch': 2} {'type': 'loss', 'content': 0.09027554839849472, 'timestamp': '2025-09-04 04:02:02.165531', 'step': 1877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:02.265048', 'step': 1877, 'epoch': 2} {'type': 'loss', 'content': 0.029700566083192825, 'timestamp': '2025-09-04 04:02:02.283427', 'step': 1878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:02:02.355323', 'step': 1878, 'epoch': 2} {'type': 'loss', 'content': 0.015838583931326866, 'timestamp': '2025-09-04 04:02:02.368341', 'step': 1879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:02.463178', 'step': 1879, 'epoch': 2} {'type': 'loss', 'content': 0.0317532904446125, 'timestamp': '2025-09-04 04:02:02.481273', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:10.983366', 'step': 1880, 'epoch': 2} {'type': 'pplx', 'content': 326.9121941560908, 'timestamp': '2025-09-04 04:02:10.985583', 'step': 1880, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1880', 'timestamp': '2025-09-04 04:02:11.488696', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:11.586020', 'step': 1880, 'epoch': 2} {'type': 'loss', 'content': 0.006327355746179819, 'timestamp': '2025-09-04 04:02:11.606788', 'step': 1881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:02:11.679312', 'step': 1881, 'epoch': 2} {'type': 'loss', 'content': 0.0010608435841277242, 'timestamp': '2025-09-04 04:02:11.692168', 'step': 1882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:11.787450', 'step': 1882, 'epoch': 2} {'type': 'loss', 'content': 0.012697882950305939, 'timestamp': '2025-09-04 04:02:11.804854', 'step': 1883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:11.912598', 'step': 1883, 'epoch': 2} {'type': 'loss', 'content': 0.01790587045252323, 'timestamp': '2025-09-04 04:02:11.933353', 'step': 1884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:12.035106', 'step': 1884, 'epoch': 2} {'type': 'loss', 'content': 0.0027732134331017733, 'timestamp': '2025-09-04 04:02:12.056040', 'step': 1885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:02:12.134830', 'step': 1885, 'epoch': 2} {'type': 'loss', 'content': 0.063087597489357, 'timestamp': '2025-09-04 04:02:12.148978', 'step': 1886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:12.256224', 'step': 1886, 'epoch': 2} {'type': 'loss', 'content': 0.0023315551225095987, 'timestamp': '2025-09-04 04:02:12.276242', 'step': 1887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:12.383110', 'step': 1887, 'epoch': 2} {'type': 'loss', 'content': 0.011332008987665176, 'timestamp': '2025-09-04 04:02:12.403326', 'step': 1888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:12.511780', 'step': 1888, 'epoch': 2} {'type': 'loss', 'content': 0.04571153596043587, 'timestamp': '2025-09-04 04:02:12.534517', 'step': 1889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:02:12.617993', 'step': 1889, 'epoch': 2} {'type': 'loss', 'content': 0.05394493415951729, 'timestamp': '2025-09-04 04:02:12.633177', 'step': 1890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:02:12.716784', 'step': 1890, 'epoch': 2} {'type': 'loss', 'content': 0.007151946425437927, 'timestamp': '2025-09-04 04:02:12.732026', 'step': 1891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:12.832763', 'step': 1891, 'epoch': 2} {'type': 'loss', 'content': 0.03096769005060196, 'timestamp': '2025-09-04 04:02:12.852161', 'step': 1892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:12.953361', 'step': 1892, 'epoch': 2} {'type': 'loss', 'content': 0.019417183473706245, 'timestamp': '2025-09-04 04:02:12.974116', 'step': 1893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:02:13.084289', 'step': 1893, 'epoch': 2} {'type': 'loss', 'content': 0.007123048882931471, 'timestamp': '2025-09-04 04:02:13.104493', 'step': 1894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:13.216318', 'step': 1894, 'epoch': 2} {'type': 'loss', 'content': 0.00961579754948616, 'timestamp': '2025-09-04 04:02:13.236746', 'step': 1895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:13.339570', 'step': 1895, 'epoch': 2} {'type': 'loss', 'content': 0.01128938514739275, 'timestamp': '2025-09-04 04:02:13.359169', 'step': 1896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:13.451654', 'step': 1896, 'epoch': 2} {'type': 'loss', 'content': 0.036590978503227234, 'timestamp': '2025-09-04 04:02:13.470687', 'step': 1897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:13.564383', 'step': 1897, 'epoch': 2} {'type': 'loss', 'content': 0.0244086105376482, 'timestamp': '2025-09-04 04:02:13.581880', 'step': 1898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:13.683164', 'step': 1898, 'epoch': 2} {'type': 'loss', 'content': 0.014544663019478321, 'timestamp': '2025-09-04 04:02:13.701976', 'step': 1899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:02:13.780322', 'step': 1899, 'epoch': 2} {'type': 'loss', 'content': 0.023842498660087585, 'timestamp': '2025-09-04 04:02:13.795174', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:22.378146', 'step': 1900, 'epoch': 2} {'type': 'pplx', 'content': 327.30657016075753, 'timestamp': '2025-09-04 04:02:22.380281', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:02:22.453274', 'step': 1900, 'epoch': 2} {'type': 'loss', 'content': 0.036261361092329025, 'timestamp': '2025-09-04 04:02:22.467795', 'step': 1901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:22.571889', 'step': 1901, 'epoch': 2} {'type': 'loss', 'content': 0.0038794318679720163, 'timestamp': '2025-09-04 04:02:22.590900', 'step': 1902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:22.697502', 'step': 1902, 'epoch': 2} {'type': 'loss', 'content': 0.00663451012223959, 'timestamp': '2025-09-04 04:02:22.715533', 'step': 1903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:22.813576', 'step': 1903, 'epoch': 2} {'type': 'loss', 'content': 0.0043860250152647495, 'timestamp': '2025-09-04 04:02:22.831587', 'step': 1904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:22.934235', 'step': 1904, 'epoch': 2} {'type': 'loss', 'content': 0.04079779237508774, 'timestamp': '2025-09-04 04:02:22.955046', 'step': 1905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:23.051479', 'step': 1905, 'epoch': 2} {'type': 'loss', 'content': 0.011518475599586964, 'timestamp': '2025-09-04 04:02:23.068728', 'step': 1906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:02:23.156438', 'step': 1906, 'epoch': 2} {'type': 'loss', 'content': 0.07115134596824646, 'timestamp': '2025-09-04 04:02:23.171339', 'step': 1907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:23.267997', 'step': 1907, 'epoch': 2} {'type': 'loss', 'content': 0.006972004193812609, 'timestamp': '2025-09-04 04:02:23.286112', 'step': 1908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:02:23.405084', 'step': 1908, 'epoch': 2} {'type': 'loss', 'content': 0.02820487506687641, 'timestamp': '2025-09-04 04:02:23.428196', 'step': 1909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:23.533819', 'step': 1909, 'epoch': 2} {'type': 'loss', 'content': 0.021369347348809242, 'timestamp': '2025-09-04 04:02:23.552962', 'step': 1910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:23.655938', 'step': 1910, 'epoch': 2} {'type': 'loss', 'content': 0.031066900119185448, 'timestamp': '2025-09-04 04:02:23.673941', 'step': 1911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:02:23.753399', 'step': 1911, 'epoch': 2} {'type': 'loss', 'content': 0.007914647459983826, 'timestamp': '2025-09-04 04:02:23.767444', 'step': 1912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:02:23.850430', 'step': 1912, 'epoch': 2} {'type': 'loss', 'content': 0.009337898343801498, 'timestamp': '2025-09-04 04:02:23.866235', 'step': 1913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:02:23.951961', 'step': 1913, 'epoch': 2} {'type': 'loss', 'content': 0.015139404684305191, 'timestamp': '2025-09-04 04:02:23.966375', 'step': 1914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:24.072580', 'step': 1914, 'epoch': 2} {'type': 'loss', 'content': 0.020058702677488327, 'timestamp': '2025-09-04 04:02:24.091184', 'step': 1915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:24.184250', 'step': 1915, 'epoch': 2} {'type': 'loss', 'content': 0.01975717768073082, 'timestamp': '2025-09-04 04:02:24.201038', 'step': 1916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:24.301071', 'step': 1916, 'epoch': 2} {'type': 'loss', 'content': 0.03391404077410698, 'timestamp': '2025-09-04 04:02:24.321252', 'step': 1917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:02:24.447433', 'step': 1917, 'epoch': 2} {'type': 'loss', 'content': 0.02063564583659172, 'timestamp': '2025-09-04 04:02:24.470101', 'step': 1918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:24.576203', 'step': 1918, 'epoch': 2} {'type': 'loss', 'content': 0.0002946726162917912, 'timestamp': '2025-09-04 04:02:24.595051', 'step': 1919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:02:24.719215', 'step': 1919, 'epoch': 2} {'type': 'loss', 'content': 0.005154923070222139, 'timestamp': '2025-09-04 04:02:24.742337', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:33.304746', 'step': 1920, 'epoch': 2} {'type': 'pplx', 'content': 328.53727900122976, 'timestamp': '2025-09-04 04:02:33.308168', 'step': 1920, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1920', 'timestamp': '2025-09-04 04:02:33.808373', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:02:33.892978', 'step': 1920, 'epoch': 2} {'type': 'loss', 'content': 0.049568939954042435, 'timestamp': '2025-09-04 04:02:33.910033', 'step': 1921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:34.011859', 'step': 1921, 'epoch': 2} {'type': 'loss', 'content': 0.03862825781106949, 'timestamp': '2025-09-04 04:02:34.030696', 'step': 1922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:02:34.108930', 'step': 1922, 'epoch': 2} {'type': 'loss', 'content': 0.003806066932156682, 'timestamp': '2025-09-04 04:02:34.123154', 'step': 1923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:02:34.216966', 'step': 1923, 'epoch': 2} {'type': 'loss', 'content': 0.020454682409763336, 'timestamp': '2025-09-04 04:02:34.234930', 'step': 1924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:34.350041', 'step': 1924, 'epoch': 2} {'type': 'loss', 'content': 0.056430667638778687, 'timestamp': '2025-09-04 04:02:34.369848', 'step': 1925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:34.501762', 'step': 1925, 'epoch': 2} {'type': 'loss', 'content': 0.007279661018401384, 'timestamp': '2025-09-04 04:02:34.520863', 'step': 1926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:34.658902', 'step': 1926, 'epoch': 2} {'type': 'loss', 'content': 0.011289707385003567, 'timestamp': '2025-09-04 04:02:34.676419', 'step': 1927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:34.772220', 'step': 1927, 'epoch': 2} {'type': 'loss', 'content': 0.0045151012018322945, 'timestamp': '2025-09-04 04:02:34.790651', 'step': 1928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:34.879643', 'step': 1928, 'epoch': 2} {'type': 'loss', 'content': 0.016711879521608353, 'timestamp': '2025-09-04 04:02:34.897944', 'step': 1929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:02:34.977184', 'step': 1929, 'epoch': 2} {'type': 'loss', 'content': 0.011652039363980293, 'timestamp': '2025-09-04 04:02:34.991213', 'step': 1930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:35.084798', 'step': 1930, 'epoch': 2} {'type': 'loss', 'content': 0.00244183954782784, 'timestamp': '2025-09-04 04:02:35.102275', 'step': 1931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:02:35.181891', 'step': 1931, 'epoch': 2} {'type': 'loss', 'content': 0.004456162918359041, 'timestamp': '2025-09-04 04:02:35.196813', 'step': 1932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:02:35.294174', 'step': 1932, 'epoch': 2} {'type': 'loss', 'content': 0.018451880663633347, 'timestamp': '2025-09-04 04:02:35.313037', 'step': 1933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:35.414679', 'step': 1933, 'epoch': 2} {'type': 'loss', 'content': 0.011028347536921501, 'timestamp': '2025-09-04 04:02:35.434011', 'step': 1934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:02:35.571788', 'step': 1934, 'epoch': 2} {'type': 'loss', 'content': 0.0028660639654845, 'timestamp': '2025-09-04 04:02:35.598124', 'step': 1935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:35.698343', 'step': 1935, 'epoch': 2} {'type': 'loss', 'content': 0.03565828502178192, 'timestamp': '2025-09-04 04:02:35.718091', 'step': 1936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:35.818676', 'step': 1936, 'epoch': 2} {'type': 'loss', 'content': 0.10143911838531494, 'timestamp': '2025-09-04 04:02:35.839976', 'step': 1937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:35.940531', 'step': 1937, 'epoch': 2} {'type': 'loss', 'content': 0.01908188872039318, 'timestamp': '2025-09-04 04:02:35.959130', 'step': 1938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:36.063139', 'step': 1938, 'epoch': 2} {'type': 'loss', 'content': 0.018672922626137733, 'timestamp': '2025-09-04 04:02:36.082502', 'step': 1939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:36.186225', 'step': 1939, 'epoch': 2} {'type': 'loss', 'content': 0.002945462241768837, 'timestamp': '2025-09-04 04:02:36.206294', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:44.692002', 'step': 1940, 'epoch': 2} {'type': 'pplx', 'content': 328.19710857649045, 'timestamp': '2025-09-04 04:02:44.693795', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:02:44.760706', 'step': 1940, 'epoch': 2} {'type': 'loss', 'content': 0.006576020736247301, 'timestamp': '2025-09-04 04:02:44.774394', 'step': 1941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:02:44.852530', 'step': 1941, 'epoch': 2} {'type': 'loss', 'content': 0.0037395476829260588, 'timestamp': '2025-09-04 04:02:44.866793', 'step': 1942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:44.966761', 'step': 1942, 'epoch': 2} {'type': 'loss', 'content': 0.008964164182543755, 'timestamp': '2025-09-04 04:02:44.985466', 'step': 1943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:45.085746', 'step': 1943, 'epoch': 2} {'type': 'loss', 'content': 0.010037235915660858, 'timestamp': '2025-09-04 04:02:45.105491', 'step': 1944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:45.204877', 'step': 1944, 'epoch': 2} {'type': 'loss', 'content': 0.03502393513917923, 'timestamp': '2025-09-04 04:02:45.226011', 'step': 1945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:45.335742', 'step': 1945, 'epoch': 2} {'type': 'loss', 'content': 0.007905172184109688, 'timestamp': '2025-09-04 04:02:45.356347', 'step': 1946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:45.452352', 'step': 1946, 'epoch': 2} {'type': 'loss', 'content': 0.023761700838804245, 'timestamp': '2025-09-04 04:02:45.470038', 'step': 1947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:02:45.579975', 'step': 1947, 'epoch': 2} {'type': 'loss', 'content': 0.0012213548179715872, 'timestamp': '2025-09-04 04:02:45.601147', 'step': 1948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:45.693335', 'step': 1948, 'epoch': 2} {'type': 'loss', 'content': 0.0033678747713565826, 'timestamp': '2025-09-04 04:02:45.712436', 'step': 1949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:45.807324', 'step': 1949, 'epoch': 2} {'type': 'loss', 'content': 0.0012253863969817758, 'timestamp': '2025-09-04 04:02:45.824862', 'step': 1950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:45.933914', 'step': 1950, 'epoch': 2} {'type': 'loss', 'content': 0.004889811389148235, 'timestamp': '2025-09-04 04:02:45.954376', 'step': 1951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:46.055612', 'step': 1951, 'epoch': 2} {'type': 'loss', 'content': 0.025324570015072823, 'timestamp': '2025-09-04 04:02:46.074986', 'step': 1952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:46.167876', 'step': 1952, 'epoch': 2} {'type': 'loss', 'content': 0.006361103150993586, 'timestamp': '2025-09-04 04:02:46.186888', 'step': 1953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:02:46.288522', 'step': 1953, 'epoch': 2} {'type': 'loss', 'content': 0.019413141533732414, 'timestamp': '2025-09-04 04:02:46.307129', 'step': 1954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:46.409295', 'step': 1954, 'epoch': 2} {'type': 'loss', 'content': 0.005714345257729292, 'timestamp': '2025-09-04 04:02:46.426613', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:02:46.513059', 'step': 1955, 'epoch': 2} {'type': 'loss', 'content': 0.056890930980443954, 'timestamp': '2025-09-04 04:02:46.529361', 'step': 1956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:02:46.603085', 'step': 1956, 'epoch': 2} {'type': 'loss', 'content': 0.049041591584682465, 'timestamp': '2025-09-04 04:02:46.618188', 'step': 1957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:02:46.728342', 'step': 1957, 'epoch': 2} {'type': 'loss', 'content': 0.021011320874094963, 'timestamp': '2025-09-04 04:02:46.748819', 'step': 1958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:46.843211', 'step': 1958, 'epoch': 2} {'type': 'loss', 'content': 0.004129297100007534, 'timestamp': '2025-09-04 04:02:46.860665', 'step': 1959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:46.967025', 'step': 1959, 'epoch': 2} {'type': 'loss', 'content': 0.0063214851543307304, 'timestamp': '2025-09-04 04:02:46.987480', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:02:55.500629', 'step': 1960, 'epoch': 2} {'type': 'pplx', 'content': 328.783539244894, 'timestamp': '2025-09-04 04:02:55.502583', 'step': 1960, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1960', 'timestamp': '2025-09-04 04:02:56.015154', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:56.116017', 'step': 1960, 'epoch': 2} {'type': 'loss', 'content': 0.02819206565618515, 'timestamp': '2025-09-04 04:02:56.137065', 'step': 1961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:02:56.227546', 'step': 1961, 'epoch': 2} {'type': 'loss', 'content': 0.0029589931946247816, 'timestamp': '2025-09-04 04:02:56.244287', 'step': 1962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:56.343431', 'step': 1962, 'epoch': 2} {'type': 'loss', 'content': 0.01074980664998293, 'timestamp': '2025-09-04 04:02:56.362178', 'step': 1963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:02:56.461841', 'step': 1963, 'epoch': 2} {'type': 'loss', 'content': 0.010816739872097969, 'timestamp': '2025-09-04 04:02:56.481319', 'step': 1964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:02:56.585680', 'step': 1964, 'epoch': 2} {'type': 'loss', 'content': 0.008484927006065845, 'timestamp': '2025-09-04 04:02:56.607894', 'step': 1965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:56.704314', 'step': 1965, 'epoch': 2} {'type': 'loss', 'content': 0.018219899386167526, 'timestamp': '2025-09-04 04:02:56.721834', 'step': 1966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:02:56.815874', 'step': 1966, 'epoch': 2} {'type': 'loss', 'content': 0.011172082275152206, 'timestamp': '2025-09-04 04:02:56.833246', 'step': 1967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:02:56.941573', 'step': 1967, 'epoch': 2} {'type': 'loss', 'content': 0.015450472943484783, 'timestamp': '2025-09-04 04:02:56.962671', 'step': 1968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:57.066734', 'step': 1968, 'epoch': 2} {'type': 'loss', 'content': 0.036945831030607224, 'timestamp': '2025-09-04 04:02:57.088685', 'step': 1969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:02:57.160350', 'step': 1969, 'epoch': 2} {'type': 'loss', 'content': 0.0042557851411402225, 'timestamp': '2025-09-04 04:02:57.173388', 'step': 1970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:02:57.258959', 'step': 1970, 'epoch': 2} {'type': 'loss', 'content': 0.019967155531048775, 'timestamp': '2025-09-04 04:02:57.274551', 'step': 1971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:57.381802', 'step': 1971, 'epoch': 2} {'type': 'loss', 'content': 0.012040197849273682, 'timestamp': '2025-09-04 04:02:57.402724', 'step': 1972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:02:57.493730', 'step': 1972, 'epoch': 2} {'type': 'loss', 'content': 0.01980067417025566, 'timestamp': '2025-09-04 04:02:57.512481', 'step': 1973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:02:57.605693', 'step': 1973, 'epoch': 2} {'type': 'loss', 'content': 0.008740060031414032, 'timestamp': '2025-09-04 04:02:57.622984', 'step': 1974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:02:57.727696', 'step': 1974, 'epoch': 2} {'type': 'loss', 'content': 0.005561890080571175, 'timestamp': '2025-09-04 04:02:57.747152', 'step': 1975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:02:57.843200', 'step': 1975, 'epoch': 2} {'type': 'loss', 'content': 0.026673782616853714, 'timestamp': '2025-09-04 04:02:57.861480', 'step': 1976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:02:57.964421', 'step': 1976, 'epoch': 2} {'type': 'loss', 'content': 0.06876686215400696, 'timestamp': '2025-09-04 04:02:57.986466', 'step': 1977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 04:02:58.148841', 'step': 1977, 'epoch': 2} {'type': 'loss', 'content': 0.005590865388512611, 'timestamp': '2025-09-04 04:02:58.180782', 'step': 1978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:02:58.264369', 'step': 1978, 'epoch': 2} {'type': 'loss', 'content': 0.004839139059185982, 'timestamp': '2025-09-04 04:02:58.279672', 'step': 1979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:02:58.383284', 'step': 1979, 'epoch': 2} {'type': 'loss', 'content': 0.02411733940243721, 'timestamp': '2025-09-04 04:02:58.403072', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:03:06.910649', 'step': 1980, 'epoch': 2} {'type': 'pplx', 'content': 331.13313135590505, 'timestamp': '2025-09-04 04:03:06.913387', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:03:07.009201', 'step': 1980, 'epoch': 2} {'type': 'loss', 'content': 0.039461344480514526, 'timestamp': '2025-09-04 04:03:07.029715', 'step': 1981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:07.132787', 'step': 1981, 'epoch': 2} {'type': 'loss', 'content': 0.013207260519266129, 'timestamp': '2025-09-04 04:03:07.151959', 'step': 1982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:07.256472', 'step': 1982, 'epoch': 2} {'type': 'loss', 'content': 0.029344527050852776, 'timestamp': '2025-09-04 04:03:07.275850', 'step': 1983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:07.380127', 'step': 1983, 'epoch': 2} {'type': 'loss', 'content': 0.0013410469982773066, 'timestamp': '2025-09-04 04:03:07.400223', 'step': 1984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:03:07.491158', 'step': 1984, 'epoch': 2} {'type': 'loss', 'content': 0.01028304360806942, 'timestamp': '2025-09-04 04:03:07.510046', 'step': 1985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:03:07.627621', 'step': 1985, 'epoch': 2} {'type': 'loss', 'content': 0.023101340979337692, 'timestamp': '2025-09-04 04:03:07.649726', 'step': 1986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:07.741169', 'step': 1986, 'epoch': 2} {'type': 'loss', 'content': 0.012508483603596687, 'timestamp': '2025-09-04 04:03:07.757857', 'step': 1987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:03:07.857215', 'step': 1987, 'epoch': 2} {'type': 'loss', 'content': 0.00880645215511322, 'timestamp': '2025-09-04 04:03:07.875535', 'step': 1988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:03:07.992983', 'step': 1988, 'epoch': 2} {'type': 'loss', 'content': 0.00218992680311203, 'timestamp': '2025-09-04 04:03:08.016938', 'step': 1989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:08.124746', 'step': 1989, 'epoch': 2} {'type': 'loss', 'content': 0.05591468885540962, 'timestamp': '2025-09-04 04:03:08.144878', 'step': 1990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:08.223275', 'step': 1990, 'epoch': 2} {'type': 'loss', 'content': 0.011654703877866268, 'timestamp': '2025-09-04 04:03:08.237515', 'step': 1991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:03:08.320578', 'step': 1991, 'epoch': 2} {'type': 'loss', 'content': 0.012211965397000313, 'timestamp': '2025-09-04 04:03:08.336518', 'step': 1992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:03:08.418294', 'step': 1992, 'epoch': 2} {'type': 'loss', 'content': 0.006080134306102991, 'timestamp': '2025-09-04 04:03:08.435040', 'step': 1993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:08.541873', 'step': 1993, 'epoch': 2} {'type': 'loss', 'content': 0.0031754986848682165, 'timestamp': '2025-09-04 04:03:08.561950', 'step': 1994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:03:08.697575', 'step': 1994, 'epoch': 2} {'type': 'loss', 'content': 0.022798430174589157, 'timestamp': '2025-09-04 04:03:08.723260', 'step': 1995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:03:08.823733', 'step': 1995, 'epoch': 2} {'type': 'loss', 'content': 0.055928491055965424, 'timestamp': '2025-09-04 04:03:08.840166', 'step': 1996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:03:08.913556', 'step': 1996, 'epoch': 2} {'type': 'loss', 'content': 0.006855425424873829, 'timestamp': '2025-09-04 04:03:08.928424', 'step': 1997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:03:09.000715', 'step': 1997, 'epoch': 2} {'type': 'loss', 'content': 0.022756323218345642, 'timestamp': '2025-09-04 04:03:09.013661', 'step': 1998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:03:09.108477', 'step': 1998, 'epoch': 2} {'type': 'loss', 'content': 0.01630261540412903, 'timestamp': '2025-09-04 04:03:09.125996', 'step': 1999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:03:09.219144', 'step': 1999, 'epoch': 2} {'type': 'loss', 'content': 0.00793174933642149, 'timestamp': '2025-09-04 04:03:09.237199', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:03:17.717901', 'step': 2000, 'epoch': 2} {'type': 'pplx', 'content': 337.7910185199579, 'timestamp': '2025-09-04 04:03:17.719822', 'step': 2000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-04 04:03:18.081079', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:18.183814', 'step': 2000, 'epoch': 2} {'type': 'loss', 'content': 0.0038866132963448763, 'timestamp': '2025-09-04 04:03:18.205004', 'step': 2001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:18.311859', 'step': 2001, 'epoch': 2} {'type': 'loss', 'content': 0.029996544122695923, 'timestamp': '2025-09-04 04:03:18.331766', 'step': 2002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:18.412991', 'step': 2002, 'epoch': 2} {'type': 'loss', 'content': 0.029153253883123398, 'timestamp': '2025-09-04 04:03:18.426980', 'step': 2003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:18.519253', 'step': 2003, 'epoch': 2} {'type': 'loss', 'content': 0.059160567820072174, 'timestamp': '2025-09-04 04:03:18.536705', 'step': 2004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:18.636731', 'step': 2004, 'epoch': 2} {'type': 'loss', 'content': 0.02167946845293045, 'timestamp': '2025-09-04 04:03:18.657389', 'step': 2005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:18.761805', 'step': 2005, 'epoch': 2} {'type': 'loss', 'content': 0.0188386719673872, 'timestamp': '2025-09-04 04:03:18.780867', 'step': 2006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:18.887731', 'step': 2006, 'epoch': 2} {'type': 'loss', 'content': 0.0005847454303875566, 'timestamp': '2025-09-04 04:03:18.906473', 'step': 2007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:18.987245', 'step': 2007, 'epoch': 2} {'type': 'loss', 'content': 0.008029515855014324, 'timestamp': '2025-09-04 04:03:19.002000', 'step': 2008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:03:19.077244', 'step': 2008, 'epoch': 2} {'type': 'loss', 'content': 0.007086843717843294, 'timestamp': '2025-09-04 04:03:19.091979', 'step': 2009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:19.214193', 'step': 2009, 'epoch': 2} {'type': 'loss', 'content': 0.008636104874312878, 'timestamp': '2025-09-04 04:03:19.233921', 'step': 2010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:19.346341', 'step': 2010, 'epoch': 2} {'type': 'loss', 'content': 0.00841162633150816, 'timestamp': '2025-09-04 04:03:19.366348', 'step': 2011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:19.478086', 'step': 2011, 'epoch': 2} {'type': 'loss', 'content': 0.005192655138671398, 'timestamp': '2025-09-04 04:03:19.499294', 'step': 2012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:03:19.584853', 'step': 2012, 'epoch': 2} {'type': 'loss', 'content': 0.016467098146677017, 'timestamp': '2025-09-04 04:03:19.601656', 'step': 2013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:19.714091', 'step': 2013, 'epoch': 2} {'type': 'loss', 'content': 0.004678299650549889, 'timestamp': '2025-09-04 04:03:19.734578', 'step': 2014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:19.829432', 'step': 2014, 'epoch': 2} {'type': 'loss', 'content': 0.038956332951784134, 'timestamp': '2025-09-04 04:03:19.846129', 'step': 2015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:03:19.946809', 'step': 2015, 'epoch': 2} {'type': 'loss', 'content': 0.013382869772613049, 'timestamp': '2025-09-04 04:03:19.966059', 'step': 2016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:20.056506', 'step': 2016, 'epoch': 2} {'type': 'loss', 'content': 0.006615063641220331, 'timestamp': '2025-09-04 04:03:20.074773', 'step': 2017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:03:20.154212', 'step': 2017, 'epoch': 2} {'type': 'loss', 'content': 0.035883184522390366, 'timestamp': '2025-09-04 04:03:20.168191', 'step': 2018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:20.272520', 'step': 2018, 'epoch': 2} {'type': 'loss', 'content': 0.0023406874388456345, 'timestamp': '2025-09-04 04:03:20.291716', 'step': 2019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:20.383383', 'step': 2019, 'epoch': 2} {'type': 'loss', 'content': 0.09516555815935135, 'timestamp': '2025-09-04 04:03:20.400858', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:03:28.856700', 'step': 2020, 'epoch': 2} {'type': 'pplx', 'content': 339.743606509487, 'timestamp': '2025-09-04 04:03:28.858645', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:03:28.975803', 'step': 2020, 'epoch': 2} {'type': 'loss', 'content': 0.0009116530418395996, 'timestamp': '2025-09-04 04:03:29.001353', 'step': 2021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:03:29.087980', 'step': 2021, 'epoch': 2} {'type': 'loss', 'content': 0.0017411328153684735, 'timestamp': '2025-09-04 04:03:29.103549', 'step': 2022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:03:29.198141', 'step': 2022, 'epoch': 2} {'type': 'loss', 'content': 0.018314287066459656, 'timestamp': '2025-09-04 04:03:29.215622', 'step': 2023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:03:29.316793', 'step': 2023, 'epoch': 2} {'type': 'loss', 'content': 0.004151246044784784, 'timestamp': '2025-09-04 04:03:29.336267', 'step': 2024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:29.412528', 'step': 2024, 'epoch': 2} {'type': 'loss', 'content': 0.03218241408467293, 'timestamp': '2025-09-04 04:03:29.428024', 'step': 2025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:29.532196', 'step': 2025, 'epoch': 2} {'type': 'loss', 'content': 0.015404434874653816, 'timestamp': '2025-09-04 04:03:29.551399', 'step': 2026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:03:29.627359', 'step': 2026, 'epoch': 2} {'type': 'loss', 'content': 0.057021476328372955, 'timestamp': '2025-09-04 04:03:29.641204', 'step': 2027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:29.746873', 'step': 2027, 'epoch': 2} {'type': 'loss', 'content': 0.0008856714703142643, 'timestamp': '2025-09-04 04:03:29.767001', 'step': 2028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:03:29.841846', 'step': 2028, 'epoch': 2} {'type': 'loss', 'content': 0.022210262715816498, 'timestamp': '2025-09-04 04:03:29.857048', 'step': 2029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:29.967779', 'step': 2029, 'epoch': 2} {'type': 'loss', 'content': 0.005982580129057169, 'timestamp': '2025-09-04 04:03:29.988354', 'step': 2030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:03:30.074993', 'step': 2030, 'epoch': 2} {'type': 'loss', 'content': 0.0008112862706184387, 'timestamp': '2025-09-04 04:03:30.090723', 'step': 2031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 04:03:30.215707', 'step': 2031, 'epoch': 2} {'type': 'loss', 'content': 0.02068488672375679, 'timestamp': '2025-09-04 04:03:30.240475', 'step': 2032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:03:30.331022', 'step': 2032, 'epoch': 2} {'type': 'loss', 'content': 0.03741706535220146, 'timestamp': '2025-09-04 04:03:30.349873', 'step': 2033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:03:30.432706', 'step': 2033, 'epoch': 2} {'type': 'loss', 'content': 0.05829369276762009, 'timestamp': '2025-09-04 04:03:30.447994', 'step': 2034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:03:30.540942', 'step': 2034, 'epoch': 2} {'type': 'loss', 'content': 0.008479294367134571, 'timestamp': '2025-09-04 04:03:30.558435', 'step': 2035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:30.668268', 'step': 2035, 'epoch': 2} {'type': 'loss', 'content': 0.04389806091785431, 'timestamp': '2025-09-04 04:03:30.689495', 'step': 2036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:30.787055', 'step': 2036, 'epoch': 2} {'type': 'loss', 'content': 0.02469690330326557, 'timestamp': '2025-09-04 04:03:30.807747', 'step': 2037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:30.911430', 'step': 2037, 'epoch': 2} {'type': 'loss', 'content': 0.02199394628405571, 'timestamp': '2025-09-04 04:03:30.930735', 'step': 2038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:31.033799', 'step': 2038, 'epoch': 2} {'type': 'loss', 'content': 0.0037276356015354395, 'timestamp': '2025-09-04 04:03:31.052730', 'step': 2039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:31.131633', 'step': 2039, 'epoch': 2} {'type': 'loss', 'content': 0.021283473819494247, 'timestamp': '2025-09-04 04:03:31.146545', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:03:39.511875', 'step': 2040, 'epoch': 2} {'type': 'pplx', 'content': 334.85919534739503, 'timestamp': '2025-09-04 04:03:39.513963', 'step': 2040, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2040', 'timestamp': '2025-09-04 04:03:40.015188', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:40.116989', 'step': 2040, 'epoch': 2} {'type': 'loss', 'content': 0.02683926559984684, 'timestamp': '2025-09-04 04:03:40.138785', 'step': 2041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:40.248182', 'step': 2041, 'epoch': 2} {'type': 'loss', 'content': 0.0386735163629055, 'timestamp': '2025-09-04 04:03:40.268736', 'step': 2042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:40.369113', 'step': 2042, 'epoch': 2} {'type': 'loss', 'content': 0.00885379035025835, 'timestamp': '2025-09-04 04:03:40.387946', 'step': 2043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:03:40.484312', 'step': 2043, 'epoch': 2} {'type': 'loss', 'content': 0.006810452789068222, 'timestamp': '2025-09-04 04:03:40.502525', 'step': 2044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:40.590982', 'step': 2044, 'epoch': 2} {'type': 'loss', 'content': 0.022753320634365082, 'timestamp': '2025-09-04 04:03:40.609346', 'step': 2045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:03:40.684482', 'step': 2045, 'epoch': 2} {'type': 'loss', 'content': 0.051130082458257675, 'timestamp': '2025-09-04 04:03:40.698277', 'step': 2046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:03:40.806236', 'step': 2046, 'epoch': 2} {'type': 'loss', 'content': 0.028301551938056946, 'timestamp': '2025-09-04 04:03:40.826395', 'step': 2047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:03:40.942751', 'step': 2047, 'epoch': 2} {'type': 'loss', 'content': 0.00401369109749794, 'timestamp': '2025-09-04 04:03:40.965669', 'step': 2048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:03:41.073652', 'step': 2048, 'epoch': 2} {'type': 'loss', 'content': 0.008323295041918755, 'timestamp': '2025-09-04 04:03:41.096381', 'step': 2049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:03:41.172951', 'step': 2049, 'epoch': 2} {'type': 'loss', 'content': 0.010393408127129078, 'timestamp': '2025-09-04 04:03:41.186961', 'step': 2050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:41.276328', 'step': 2050, 'epoch': 2} {'type': 'loss', 'content': 0.0038162292912602425, 'timestamp': '2025-09-04 04:03:41.293090', 'step': 2051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:03:41.402980', 'step': 2051, 'epoch': 2} {'type': 'loss', 'content': 0.0021734382025897503, 'timestamp': '2025-09-04 04:03:41.424184', 'step': 2052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:41.522034', 'step': 2052, 'epoch': 2} {'type': 'loss', 'content': 0.036829832941293716, 'timestamp': '2025-09-04 04:03:41.542779', 'step': 2053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:03:41.650754', 'step': 2053, 'epoch': 2} {'type': 'loss', 'content': 0.015699857845902443, 'timestamp': '2025-09-04 04:03:41.670987', 'step': 2054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:03:41.754688', 'step': 2054, 'epoch': 2} {'type': 'loss', 'content': 0.014599669724702835, 'timestamp': '2025-09-04 04:03:41.769707', 'step': 2055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:41.877413', 'step': 2055, 'epoch': 2} {'type': 'loss', 'content': 0.029209831729531288, 'timestamp': '2025-09-04 04:03:41.898633', 'step': 2056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:03:41.974188', 'step': 2056, 'epoch': 2} {'type': 'loss', 'content': 0.022812718525528908, 'timestamp': '2025-09-04 04:03:41.989608', 'step': 2057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:42.092066', 'step': 2057, 'epoch': 2} {'type': 'loss', 'content': 0.013986658304929733, 'timestamp': '2025-09-04 04:03:42.111323', 'step': 2058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:03:42.199588', 'step': 2058, 'epoch': 2} {'type': 'loss', 'content': 0.055983904749155045, 'timestamp': '2025-09-04 04:03:42.215227', 'step': 2059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:03:42.319945', 'step': 2059, 'epoch': 2} {'type': 'loss', 'content': 0.007262484170496464, 'timestamp': '2025-09-04 04:03:42.339917', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:03:50.711637', 'step': 2060, 'epoch': 2} {'type': 'pplx', 'content': 326.4215391255511, 'timestamp': '2025-09-04 04:03:50.713840', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:03:50.791986', 'step': 2060, 'epoch': 2} {'type': 'loss', 'content': 0.009184667840600014, 'timestamp': '2025-09-04 04:03:50.808350', 'step': 2061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:50.919805', 'step': 2061, 'epoch': 2} {'type': 'loss', 'content': 0.07893642038106918, 'timestamp': '2025-09-04 04:03:50.940465', 'step': 2062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:03:51.033591', 'step': 2062, 'epoch': 2} {'type': 'loss', 'content': 0.008863512426614761, 'timestamp': '2025-09-04 04:03:51.050975', 'step': 2063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:51.151163', 'step': 2063, 'epoch': 2} {'type': 'loss', 'content': 0.007501260843127966, 'timestamp': '2025-09-04 04:03:51.170850', 'step': 2064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:03:51.267183', 'step': 2064, 'epoch': 2} {'type': 'loss', 'content': 0.0035026571713387966, 'timestamp': '2025-09-04 04:03:51.287661', 'step': 2065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:03:51.391160', 'step': 2065, 'epoch': 2} {'type': 'loss', 'content': 0.010278237983584404, 'timestamp': '2025-09-04 04:03:51.408252', 'step': 2066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:51.507168', 'step': 2066, 'epoch': 2} {'type': 'loss', 'content': 0.042156293988227844, 'timestamp': '2025-09-04 04:03:51.526171', 'step': 2067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:03:51.627938', 'step': 2067, 'epoch': 2} {'type': 'loss', 'content': 0.024234874173998833, 'timestamp': '2025-09-04 04:03:51.647756', 'step': 2068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:03:51.752621', 'step': 2068, 'epoch': 2} {'type': 'loss', 'content': 0.0024930352810770273, 'timestamp': '2025-09-04 04:03:51.775261', 'step': 2069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:03:51.883717', 'step': 2069, 'epoch': 2} {'type': 'loss', 'content': 0.043505195528268814, 'timestamp': '2025-09-04 04:03:51.903987', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:52.014076', 'step': 2070, 'epoch': 2} {'type': 'loss', 'content': 0.006645853631198406, 'timestamp': '2025-09-04 04:03:52.033063', 'step': 2071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:03:52.104148', 'step': 2071, 'epoch': 2} {'type': 'loss', 'content': 0.02741372399032116, 'timestamp': '2025-09-04 04:03:52.117669', 'step': 2072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:03:52.210557', 'step': 2072, 'epoch': 2} {'type': 'loss', 'content': 0.0004673259099945426, 'timestamp': '2025-09-04 04:03:52.229709', 'step': 2073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:03:52.335470', 'step': 2073, 'epoch': 2} {'type': 'loss', 'content': 0.023968873545527458, 'timestamp': '2025-09-04 04:03:52.355570', 'step': 2074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:03:52.490754', 'step': 2074, 'epoch': 2} {'type': 'loss', 'content': 0.011759743094444275, 'timestamp': '2025-09-04 04:03:52.517067', 'step': 2075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:03:52.629680', 'step': 2075, 'epoch': 2} {'type': 'loss', 'content': 0.002521298360079527, 'timestamp': '2025-09-04 04:03:52.651213', 'step': 2076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:03:52.733607', 'step': 2076, 'epoch': 2} {'type': 'loss', 'content': 0.019572464749217033, 'timestamp': '2025-09-04 04:03:52.750683', 'step': 2077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:03:52.840487', 'step': 2077, 'epoch': 2} {'type': 'loss', 'content': 0.002779679372906685, 'timestamp': '2025-09-04 04:03:52.857245', 'step': 2078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:52.957934', 'step': 2078, 'epoch': 2} {'type': 'loss', 'content': 0.026165010407567024, 'timestamp': '2025-09-04 04:03:52.976685', 'step': 2079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:03:53.076614', 'step': 2079, 'epoch': 2} {'type': 'loss', 'content': 0.004176696762442589, 'timestamp': '2025-09-04 04:03:53.096246', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:01.472371', 'step': 2080, 'epoch': 2} {'type': 'pplx', 'content': 320.5430643577628, 'timestamp': '2025-09-04 04:04:01.474232', 'step': 2080, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2080', 'timestamp': '2025-09-04 04:04:01.816278', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:04:01.899311', 'step': 2080, 'epoch': 2} {'type': 'loss', 'content': 0.0009140381007455289, 'timestamp': '2025-09-04 04:04:01.916437', 'step': 2081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:02.017975', 'step': 2081, 'epoch': 2} {'type': 'loss', 'content': 0.0035500312224030495, 'timestamp': '2025-09-04 04:04:02.036771', 'step': 2082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:04:02.123421', 'step': 2082, 'epoch': 2} {'type': 'loss', 'content': 0.06194588169455528, 'timestamp': '2025-09-04 04:04:02.139060', 'step': 2083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:02.234678', 'step': 2083, 'epoch': 2} {'type': 'loss', 'content': 0.08327718824148178, 'timestamp': '2025-09-04 04:04:02.252937', 'step': 2084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:04:02.367288', 'step': 2084, 'epoch': 2} {'type': 'loss', 'content': 0.0026233464013785124, 'timestamp': '2025-09-04 04:04:02.391497', 'step': 2085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:02.491052', 'step': 2085, 'epoch': 2} {'type': 'loss', 'content': 0.13432270288467407, 'timestamp': '2025-09-04 04:04:02.509568', 'step': 2086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:04:02.595298', 'step': 2086, 'epoch': 2} {'type': 'loss', 'content': 0.026126880198717117, 'timestamp': '2025-09-04 04:04:02.610922', 'step': 2087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:04:02.732732', 'step': 2087, 'epoch': 2} {'type': 'loss', 'content': 0.001551034045405686, 'timestamp': '2025-09-04 04:04:02.756661', 'step': 2088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:02.857941', 'step': 2088, 'epoch': 2} {'type': 'loss', 'content': 0.002250077435746789, 'timestamp': '2025-09-04 04:04:02.879175', 'step': 2089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:02.980040', 'step': 2089, 'epoch': 2} {'type': 'loss', 'content': 0.03148679807782173, 'timestamp': '2025-09-04 04:04:02.998944', 'step': 2090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:03.093272', 'step': 2090, 'epoch': 2} {'type': 'loss', 'content': 0.01184455119073391, 'timestamp': '2025-09-04 04:04:03.110673', 'step': 2091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:03.211623', 'step': 2091, 'epoch': 2} {'type': 'loss', 'content': 0.0011833092430606484, 'timestamp': '2025-09-04 04:04:03.231212', 'step': 2092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:03.324401', 'step': 2092, 'epoch': 2} {'type': 'loss', 'content': 0.003595761489123106, 'timestamp': '2025-09-04 04:04:03.343596', 'step': 2093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:03.437989', 'step': 2093, 'epoch': 2} {'type': 'loss', 'content': 0.027849143370985985, 'timestamp': '2025-09-04 04:04:03.455117', 'step': 2094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:03.556386', 'step': 2094, 'epoch': 2} {'type': 'loss', 'content': 0.022928999736905098, 'timestamp': '2025-09-04 04:04:03.575190', 'step': 2095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:03.668150', 'step': 2095, 'epoch': 2} {'type': 'loss', 'content': 0.07359001040458679, 'timestamp': '2025-09-04 04:04:03.686058', 'step': 2096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:03.789756', 'step': 2096, 'epoch': 2} {'type': 'loss', 'content': 0.05725327506661415, 'timestamp': '2025-09-04 04:04:03.811698', 'step': 2097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:03.915279', 'step': 2097, 'epoch': 2} {'type': 'loss', 'content': 0.04447811469435692, 'timestamp': '2025-09-04 04:04:03.934528', 'step': 2098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:04.039378', 'step': 2098, 'epoch': 2} {'type': 'loss', 'content': 0.0025624537374824286, 'timestamp': '2025-09-04 04:04:04.057979', 'step': 2099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:04.158399', 'step': 2099, 'epoch': 2} {'type': 'loss', 'content': 0.0035990336909890175, 'timestamp': '2025-09-04 04:04:04.178016', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:12.532858', 'step': 2100, 'epoch': 2} {'type': 'pplx', 'content': 314.23096938736927, 'timestamp': '2025-09-04 04:04:12.534871', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:12.620879', 'step': 2100, 'epoch': 2} {'type': 'loss', 'content': 0.003338422393426299, 'timestamp': '2025-09-04 04:04:12.639036', 'step': 2101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:04:12.731933', 'step': 2101, 'epoch': 2} {'type': 'loss', 'content': 0.002954112831503153, 'timestamp': '2025-09-04 04:04:12.749327', 'step': 2102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:12.850749', 'step': 2102, 'epoch': 2} {'type': 'loss', 'content': 0.009453012607991695, 'timestamp': '2025-09-04 04:04:12.869799', 'step': 2103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:04:13.007718', 'step': 2103, 'epoch': 2} {'type': 'loss', 'content': 0.020522311329841614, 'timestamp': '2025-09-04 04:04:13.034524', 'step': 2104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:13.121973', 'step': 2104, 'epoch': 2} {'type': 'loss', 'content': 0.024143625050783157, 'timestamp': '2025-09-04 04:04:13.140329', 'step': 2105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:13.232648', 'step': 2105, 'epoch': 2} {'type': 'loss', 'content': 0.00796930119395256, 'timestamp': '2025-09-04 04:04:13.249377', 'step': 2106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:13.351215', 'step': 2106, 'epoch': 2} {'type': 'loss', 'content': 0.000632771581877023, 'timestamp': '2025-09-04 04:04:13.370206', 'step': 2107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:13.460383', 'step': 2107, 'epoch': 2} {'type': 'loss', 'content': 0.002220430178567767, 'timestamp': '2025-09-04 04:04:13.478003', 'step': 2108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:13.583120', 'step': 2108, 'epoch': 2} {'type': 'loss', 'content': 0.01635553501546383, 'timestamp': '2025-09-04 04:04:13.604209', 'step': 2109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:13.710252', 'step': 2109, 'epoch': 2} {'type': 'loss', 'content': 0.017017148435115814, 'timestamp': '2025-09-04 04:04:13.729384', 'step': 2110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:04:13.806648', 'step': 2110, 'epoch': 2} {'type': 'loss', 'content': 0.0019480792107060552, 'timestamp': '2025-09-04 04:04:13.820438', 'step': 2111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:13.929794', 'step': 2111, 'epoch': 2} {'type': 'loss', 'content': 0.012161768041551113, 'timestamp': '2025-09-04 04:04:13.950904', 'step': 2112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:14.049070', 'step': 2112, 'epoch': 2} {'type': 'loss', 'content': 0.048178572207689285, 'timestamp': '2025-09-04 04:04:14.069251', 'step': 2113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:14.173919', 'step': 2113, 'epoch': 2} {'type': 'loss', 'content': 0.003165569854900241, 'timestamp': '2025-09-04 04:04:14.192987', 'step': 2114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:14.297053', 'step': 2114, 'epoch': 2} {'type': 'loss', 'content': 0.015800610184669495, 'timestamp': '2025-09-04 04:04:14.316020', 'step': 2115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:14.416443', 'step': 2115, 'epoch': 2} {'type': 'loss', 'content': 0.048695262521505356, 'timestamp': '2025-09-04 04:04:14.435860', 'step': 2116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:14.535495', 'step': 2116, 'epoch': 2} {'type': 'loss', 'content': 0.002211391692981124, 'timestamp': '2025-09-04 04:04:14.556550', 'step': 2117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:04:14.666573', 'step': 2117, 'epoch': 2} {'type': 'loss', 'content': 0.006672736257314682, 'timestamp': '2025-09-04 04:04:14.687028', 'step': 2118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:04:14.803052', 'step': 2118, 'epoch': 2} {'type': 'loss', 'content': 0.010592850856482983, 'timestamp': '2025-09-04 04:04:14.825098', 'step': 2119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:14.932592', 'step': 2119, 'epoch': 2} {'type': 'loss', 'content': 0.020320625975728035, 'timestamp': '2025-09-04 04:04:14.953122', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:23.313391', 'step': 2120, 'epoch': 2} {'type': 'pplx', 'content': 312.7804928893332, 'timestamp': '2025-09-04 04:04:23.315624', 'step': 2120, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2120', 'timestamp': '2025-09-04 04:04:23.681997', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:23.781623', 'step': 2120, 'epoch': 2} {'type': 'loss', 'content': 0.010732216760516167, 'timestamp': '2025-09-04 04:04:23.802720', 'step': 2121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:04:23.889401', 'step': 2121, 'epoch': 2} {'type': 'loss', 'content': 0.005511862691491842, 'timestamp': '2025-09-04 04:04:23.905182', 'step': 2122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:23.997677', 'step': 2122, 'epoch': 2} {'type': 'loss', 'content': 0.0008789485436864197, 'timestamp': '2025-09-04 04:04:24.014839', 'step': 2123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 04:04:24.199203', 'step': 2123, 'epoch': 2} {'type': 'loss', 'content': 0.018326403573155403, 'timestamp': '2025-09-04 04:04:24.234414', 'step': 2124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:24.324777', 'step': 2124, 'epoch': 2} {'type': 'loss', 'content': 0.012555736117064953, 'timestamp': '2025-09-04 04:04:24.343404', 'step': 2125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:24.453436', 'step': 2125, 'epoch': 2} {'type': 'loss', 'content': 0.0006459427531808615, 'timestamp': '2025-09-04 04:04:24.473852', 'step': 2126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:24.575742', 'step': 2126, 'epoch': 2} {'type': 'loss', 'content': 0.0454648919403553, 'timestamp': '2025-09-04 04:04:24.594874', 'step': 2127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:04:24.680520', 'step': 2127, 'epoch': 2} {'type': 'loss', 'content': 0.10542017221450806, 'timestamp': '2025-09-04 04:04:24.696351', 'step': 2128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:04:24.782005', 'step': 2128, 'epoch': 2} {'type': 'loss', 'content': 0.014704999513924122, 'timestamp': '2025-09-04 04:04:24.799271', 'step': 2129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:24.889998', 'step': 2129, 'epoch': 2} {'type': 'loss', 'content': 0.0018945076735690236, 'timestamp': '2025-09-04 04:04:24.906873', 'step': 2130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:25.005752', 'step': 2130, 'epoch': 2} {'type': 'loss', 'content': 0.03164684399962425, 'timestamp': '2025-09-04 04:04:25.024319', 'step': 2131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:25.122975', 'step': 2131, 'epoch': 2} {'type': 'loss', 'content': 0.0005463764537125826, 'timestamp': '2025-09-04 04:04:25.142239', 'step': 2132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 04:04:25.323234', 'step': 2132, 'epoch': 2} {'type': 'loss', 'content': 0.008721102960407734, 'timestamp': '2025-09-04 04:04:25.361001', 'step': 2133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:25.459833', 'step': 2133, 'epoch': 2} {'type': 'loss', 'content': 0.008502230979502201, 'timestamp': '2025-09-04 04:04:25.478374', 'step': 2134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:04:25.555190', 'step': 2134, 'epoch': 2} {'type': 'loss', 'content': 0.06675466895103455, 'timestamp': '2025-09-04 04:04:25.569241', 'step': 2135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:25.670339', 'step': 2135, 'epoch': 2} {'type': 'loss', 'content': 0.006031819619238377, 'timestamp': '2025-09-04 04:04:25.689924', 'step': 2136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:25.788530', 'step': 2136, 'epoch': 2} {'type': 'loss', 'content': 0.005401272792369127, 'timestamp': '2025-09-04 04:04:25.809427', 'step': 2137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:25.904126', 'step': 2137, 'epoch': 2} {'type': 'loss', 'content': 0.008368422277271748, 'timestamp': '2025-09-04 04:04:25.921786', 'step': 2138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:26.024282', 'step': 2138, 'epoch': 2} {'type': 'loss', 'content': 0.004673839081078768, 'timestamp': '2025-09-04 04:04:26.043265', 'step': 2139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:26.153282', 'step': 2139, 'epoch': 2} {'type': 'loss', 'content': 0.03570368513464928, 'timestamp': '2025-09-04 04:04:26.174401', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:34.544277', 'step': 2140, 'epoch': 2} {'type': 'pplx', 'content': 310.4419842128476, 'timestamp': '2025-09-04 04:04:34.546820', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:34.636749', 'step': 2140, 'epoch': 2} {'type': 'loss', 'content': 0.0012499855365604162, 'timestamp': '2025-09-04 04:04:34.655620', 'step': 2141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:04:34.731528', 'step': 2141, 'epoch': 2} {'type': 'loss', 'content': 0.006291459314525127, 'timestamp': '2025-09-04 04:04:34.745151', 'step': 2142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:04:34.853013', 'step': 2142, 'epoch': 2} {'type': 'loss', 'content': 0.0022742494475096464, 'timestamp': '2025-09-04 04:04:34.873107', 'step': 2143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:34.984340', 'step': 2143, 'epoch': 2} {'type': 'loss', 'content': 0.01345337089151144, 'timestamp': '2025-09-04 04:04:35.005729', 'step': 2144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:35.094070', 'step': 2144, 'epoch': 2} {'type': 'loss', 'content': 0.03179781138896942, 'timestamp': '2025-09-04 04:04:35.112411', 'step': 2145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:35.212528', 'step': 2145, 'epoch': 2} {'type': 'loss', 'content': 0.004378853365778923, 'timestamp': '2025-09-04 04:04:35.231121', 'step': 2146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 04:04:35.435817', 'step': 2146, 'epoch': 2} {'type': 'loss', 'content': 0.0009116848814301193, 'timestamp': '2025-09-04 04:04:35.475134', 'step': 2147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:35.578468', 'step': 2147, 'epoch': 2} {'type': 'loss', 'content': 0.007793641183525324, 'timestamp': '2025-09-04 04:04:35.598367', 'step': 2148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:04:35.705536', 'step': 2148, 'epoch': 2} {'type': 'loss', 'content': 0.009497767314314842, 'timestamp': '2025-09-04 04:04:35.728150', 'step': 2149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:04:35.812631', 'step': 2149, 'epoch': 2} {'type': 'loss', 'content': 0.038677141070365906, 'timestamp': '2025-09-04 04:04:35.827760', 'step': 2150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:04:35.910668', 'step': 2150, 'epoch': 2} {'type': 'loss', 'content': 0.006493302993476391, 'timestamp': '2025-09-04 04:04:35.925541', 'step': 2151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:36.021059', 'step': 2151, 'epoch': 2} {'type': 'loss', 'content': 0.006713113281875849, 'timestamp': '2025-09-04 04:04:36.039260', 'step': 2152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:36.144671', 'step': 2152, 'epoch': 2} {'type': 'loss', 'content': 0.05974787473678589, 'timestamp': '2025-09-04 04:04:36.165757', 'step': 2153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:36.257893', 'step': 2153, 'epoch': 2} {'type': 'loss', 'content': 0.005305037368088961, 'timestamp': '2025-09-04 04:04:36.274415', 'step': 2154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:36.375842', 'step': 2154, 'epoch': 2} {'type': 'loss', 'content': 0.018993912264704704, 'timestamp': '2025-09-04 04:04:36.394459', 'step': 2155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:04:36.471725', 'step': 2155, 'epoch': 2} {'type': 'loss', 'content': 0.02348383143544197, 'timestamp': '2025-09-04 04:04:36.486467', 'step': 2156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:36.578110', 'step': 2156, 'epoch': 2} {'type': 'loss', 'content': 0.020909776911139488, 'timestamp': '2025-09-04 04:04:36.596903', 'step': 2157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:36.687043', 'step': 2157, 'epoch': 2} {'type': 'loss', 'content': 0.01592099852859974, 'timestamp': '2025-09-04 04:04:36.703800', 'step': 2158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:36.813038', 'step': 2158, 'epoch': 2} {'type': 'loss', 'content': 0.045487433671951294, 'timestamp': '2025-09-04 04:04:36.833248', 'step': 2159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:36.937059', 'step': 2159, 'epoch': 2} {'type': 'loss', 'content': 0.02561834827065468, 'timestamp': '2025-09-04 04:04:36.956799', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:45.360928', 'step': 2160, 'epoch': 2} {'type': 'pplx', 'content': 308.36794591931954, 'timestamp': '2025-09-04 04:04:45.362993', 'step': 2160, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2160', 'timestamp': '2025-09-04 04:04:45.876044', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:45.978684', 'step': 2160, 'epoch': 2} {'type': 'loss', 'content': 0.01050900761038065, 'timestamp': '2025-09-04 04:04:46.000386', 'step': 2161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:46.105099', 'step': 2161, 'epoch': 2} {'type': 'loss', 'content': 0.0030154536943882704, 'timestamp': '2025-09-04 04:04:46.124404', 'step': 2162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:46.227803', 'step': 2162, 'epoch': 2} {'type': 'loss', 'content': 0.019422519952058792, 'timestamp': '2025-09-04 04:04:46.247095', 'step': 2163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:46.352915', 'step': 2163, 'epoch': 2} {'type': 'loss', 'content': 0.05433286726474762, 'timestamp': '2025-09-04 04:04:46.373652', 'step': 2164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:46.464426', 'step': 2164, 'epoch': 2} {'type': 'loss', 'content': 0.020474649965763092, 'timestamp': '2025-09-04 04:04:46.483111', 'step': 2165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:46.584033', 'step': 2165, 'epoch': 2} {'type': 'loss', 'content': 0.004737554118037224, 'timestamp': '2025-09-04 04:04:46.602973', 'step': 2166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 04:04:46.733689', 'step': 2166, 'epoch': 2} {'type': 'loss', 'content': 0.0017673440743237734, 'timestamp': '2025-09-04 04:04:46.758348', 'step': 2167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:46.859429', 'step': 2167, 'epoch': 2} {'type': 'loss', 'content': 0.052274227142333984, 'timestamp': '2025-09-04 04:04:46.879084', 'step': 2168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:46.977326', 'step': 2168, 'epoch': 2} {'type': 'loss', 'content': 0.004377824254333973, 'timestamp': '2025-09-04 04:04:46.997720', 'step': 2169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:47.103598', 'step': 2169, 'epoch': 2} {'type': 'loss', 'content': 0.04207998141646385, 'timestamp': '2025-09-04 04:04:47.123611', 'step': 2170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:04:47.225813', 'step': 2170, 'epoch': 2} {'type': 'loss', 'content': 0.007436053827404976, 'timestamp': '2025-09-04 04:04:47.243190', 'step': 2171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:04:47.337262', 'step': 2171, 'epoch': 2} {'type': 'loss', 'content': 0.007250096648931503, 'timestamp': '2025-09-04 04:04:47.353490', 'step': 2172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:04:47.434923', 'step': 2172, 'epoch': 2} {'type': 'loss', 'content': 0.0015918496064841747, 'timestamp': '2025-09-04 04:04:47.451549', 'step': 2173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:47.556307', 'step': 2173, 'epoch': 2} {'type': 'loss', 'content': 0.01956302858889103, 'timestamp': '2025-09-04 04:04:47.575618', 'step': 2174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:47.681540', 'step': 2174, 'epoch': 2} {'type': 'loss', 'content': 0.04470936954021454, 'timestamp': '2025-09-04 04:04:47.701563', 'step': 2175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:47.802526', 'step': 2175, 'epoch': 2} {'type': 'loss', 'content': 0.03270625323057175, 'timestamp': '2025-09-04 04:04:47.822128', 'step': 2176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:47.930127', 'step': 2176, 'epoch': 2} {'type': 'loss', 'content': 0.0014341897331178188, 'timestamp': '2025-09-04 04:04:47.952666', 'step': 2177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:48.053251', 'step': 2177, 'epoch': 2} {'type': 'loss', 'content': 0.019161755219101906, 'timestamp': '2025-09-04 04:04:48.071882', 'step': 2178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:48.181349', 'step': 2178, 'epoch': 2} {'type': 'loss', 'content': 0.0018422268331050873, 'timestamp': '2025-09-04 04:04:48.201898', 'step': 2179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:48.295085', 'step': 2179, 'epoch': 2} {'type': 'loss', 'content': 0.017137227579951286, 'timestamp': '2025-09-04 04:04:48.312984', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:04:56.709103', 'step': 2180, 'epoch': 2} {'type': 'pplx', 'content': 308.3335132373899, 'timestamp': '2025-09-04 04:04:56.710918', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:56.799844', 'step': 2180, 'epoch': 2} {'type': 'loss', 'content': 0.013462487608194351, 'timestamp': '2025-09-04 04:04:56.818604', 'step': 2181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:04:56.915894', 'step': 2181, 'epoch': 2} {'type': 'loss', 'content': 0.015639422461390495, 'timestamp': '2025-09-04 04:04:56.933526', 'step': 2182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:57.027042', 'step': 2182, 'epoch': 2} {'type': 'loss', 'content': 0.008944478817284107, 'timestamp': '2025-09-04 04:04:57.044188', 'step': 2183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:57.147968', 'step': 2183, 'epoch': 2} {'type': 'loss', 'content': 0.037470266222953796, 'timestamp': '2025-09-04 04:04:57.168011', 'step': 2184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:57.266544', 'step': 2184, 'epoch': 2} {'type': 'loss', 'content': 0.04356255754828453, 'timestamp': '2025-09-04 04:04:57.287325', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:04:57.380483', 'step': 2185, 'epoch': 2} {'type': 'loss', 'content': 0.012546907179057598, 'timestamp': '2025-09-04 04:04:57.397427', 'step': 2186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:57.504540', 'step': 2186, 'epoch': 2} {'type': 'loss', 'content': 0.0026249419897794724, 'timestamp': '2025-09-04 04:04:57.524297', 'step': 2187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:57.628114', 'step': 2187, 'epoch': 2} {'type': 'loss', 'content': 0.007875807583332062, 'timestamp': '2025-09-04 04:04:57.647908', 'step': 2188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:04:57.755912', 'step': 2188, 'epoch': 2} {'type': 'loss', 'content': 0.020884279161691666, 'timestamp': '2025-09-04 04:04:57.778278', 'step': 2189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:57.881872', 'step': 2189, 'epoch': 2} {'type': 'loss', 'content': 0.010031616315245628, 'timestamp': '2025-09-04 04:04:57.901024', 'step': 2190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:04:57.984919', 'step': 2190, 'epoch': 2} {'type': 'loss', 'content': 0.01399738434702158, 'timestamp': '2025-09-04 04:04:58.000170', 'step': 2191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:58.099531', 'step': 2191, 'epoch': 2} {'type': 'loss', 'content': 0.008733275346457958, 'timestamp': '2025-09-04 04:04:58.118882', 'step': 2192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:04:58.221416', 'step': 2192, 'epoch': 2} {'type': 'loss', 'content': 0.03269082307815552, 'timestamp': '2025-09-04 04:04:58.241771', 'step': 2193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:04:58.343688', 'step': 2193, 'epoch': 2} {'type': 'loss', 'content': 0.03407390043139458, 'timestamp': '2025-09-04 04:04:58.362657', 'step': 2194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:58.462986', 'step': 2194, 'epoch': 2} {'type': 'loss', 'content': 0.016373533755540848, 'timestamp': '2025-09-04 04:04:58.481948', 'step': 2195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:04:58.582160', 'step': 2195, 'epoch': 2} {'type': 'loss', 'content': 0.002695757895708084, 'timestamp': '2025-09-04 04:04:58.601793', 'step': 2196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:58.702197', 'step': 2196, 'epoch': 2} {'type': 'loss', 'content': 0.03435313701629639, 'timestamp': '2025-09-04 04:04:58.723146', 'step': 2197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:04:58.829250', 'step': 2197, 'epoch': 2} {'type': 'loss', 'content': 0.011219386011362076, 'timestamp': '2025-09-04 04:04:58.849348', 'step': 2198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:04:58.940281', 'step': 2198, 'epoch': 2} {'type': 'loss', 'content': 0.018078068271279335, 'timestamp': '2025-09-04 04:04:58.957188', 'step': 2199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:04:59.062258', 'step': 2199, 'epoch': 2} {'type': 'loss', 'content': 0.028233405202627182, 'timestamp': '2025-09-04 04:04:59.082408', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:05:07.470250', 'step': 2200, 'epoch': 2} {'type': 'pplx', 'content': 312.6614110155863, 'timestamp': '2025-09-04 04:05:07.473143', 'step': 2200, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2200', 'timestamp': '2025-09-04 04:05:07.823843', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:07.923299', 'step': 2200, 'epoch': 2} {'type': 'loss', 'content': 0.016595905646681786, 'timestamp': '2025-09-04 04:05:07.944190', 'step': 2201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:08.039605', 'step': 2201, 'epoch': 2} {'type': 'loss', 'content': 0.016727754846215248, 'timestamp': '2025-09-04 04:05:08.056980', 'step': 2202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:08.149959', 'step': 2202, 'epoch': 2} {'type': 'loss', 'content': 0.008159826509654522, 'timestamp': '2025-09-04 04:05:08.166773', 'step': 2203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:05:08.254409', 'step': 2203, 'epoch': 2} {'type': 'loss', 'content': 0.016688521951436996, 'timestamp': '2025-09-04 04:05:08.270697', 'step': 2204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:08.363758', 'step': 2204, 'epoch': 2} {'type': 'loss', 'content': 0.003086843527853489, 'timestamp': '2025-09-04 04:05:08.382943', 'step': 2205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:08.487266', 'step': 2205, 'epoch': 2} {'type': 'loss', 'content': 0.003638209542259574, 'timestamp': '2025-09-04 04:05:08.506389', 'step': 2206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:08.600858', 'step': 2206, 'epoch': 2} {'type': 'loss', 'content': 0.01052644569426775, 'timestamp': '2025-09-04 04:05:08.618092', 'step': 2207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:08.722308', 'step': 2207, 'epoch': 2} {'type': 'loss', 'content': 0.022792614996433258, 'timestamp': '2025-09-04 04:05:08.742369', 'step': 2208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:08.846839', 'step': 2208, 'epoch': 2} {'type': 'loss', 'content': 0.002457141410559416, 'timestamp': '2025-09-04 04:05:08.868936', 'step': 2209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:08.959620', 'step': 2209, 'epoch': 2} {'type': 'loss', 'content': 0.06071026250720024, 'timestamp': '2025-09-04 04:05:08.976336', 'step': 2210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:09.082718', 'step': 2210, 'epoch': 2} {'type': 'loss', 'content': 0.03304049372673035, 'timestamp': '2025-09-04 04:05:09.101999', 'step': 2211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:09.205527', 'step': 2211, 'epoch': 2} {'type': 'loss', 'content': 0.00047334007103927433, 'timestamp': '2025-09-04 04:05:09.225143', 'step': 2212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:09.316744', 'step': 2212, 'epoch': 2} {'type': 'loss', 'content': 0.03290455788373947, 'timestamp': '2025-09-04 04:05:09.335668', 'step': 2213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:09.430257', 'step': 2213, 'epoch': 2} {'type': 'loss', 'content': 0.007393876556307077, 'timestamp': '2025-09-04 04:05:09.447602', 'step': 2214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:09.551318', 'step': 2214, 'epoch': 2} {'type': 'loss', 'content': 0.012073284946382046, 'timestamp': '2025-09-04 04:05:09.570585', 'step': 2215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:09.662952', 'step': 2215, 'epoch': 2} {'type': 'loss', 'content': 0.07519317418336868, 'timestamp': '2025-09-04 04:05:09.680572', 'step': 2216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:09.781402', 'step': 2216, 'epoch': 2} {'type': 'loss', 'content': 0.04182714223861694, 'timestamp': '2025-09-04 04:05:09.801878', 'step': 2217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:09.907420', 'step': 2217, 'epoch': 2} {'type': 'loss', 'content': 0.0027501049917191267, 'timestamp': '2025-09-04 04:05:09.926426', 'step': 2218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:05:10.062445', 'step': 2218, 'epoch': 2} {'type': 'loss', 'content': 0.007170806173235178, 'timestamp': '2025-09-04 04:05:10.088381', 'step': 2219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:10.184360', 'step': 2219, 'epoch': 2} {'type': 'loss', 'content': 0.01622505858540535, 'timestamp': '2025-09-04 04:05:10.202528', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:05:18.580414', 'step': 2220, 'epoch': 2} {'type': 'pplx', 'content': 317.98368699126075, 'timestamp': '2025-09-04 04:05:18.582752', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:18.682258', 'step': 2220, 'epoch': 2} {'type': 'loss', 'content': 0.0025658165104687214, 'timestamp': '2025-09-04 04:05:18.703606', 'step': 2221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:05:18.778872', 'step': 2221, 'epoch': 2} {'type': 'loss', 'content': 0.013223507441580296, 'timestamp': '2025-09-04 04:05:18.792344', 'step': 2222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:18.885172', 'step': 2222, 'epoch': 2} {'type': 'loss', 'content': 0.010641835629940033, 'timestamp': '2025-09-04 04:05:18.902241', 'step': 2223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:05:19.036671', 'step': 2223, 'epoch': 2} {'type': 'loss', 'content': 0.011351378634572029, 'timestamp': '2025-09-04 04:05:19.063427', 'step': 2224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:19.164795', 'step': 2224, 'epoch': 2} {'type': 'loss', 'content': 0.00913854967802763, 'timestamp': '2025-09-04 04:05:19.186006', 'step': 2225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:05:19.283788', 'step': 2225, 'epoch': 2} {'type': 'loss', 'content': 0.034181058406829834, 'timestamp': '2025-09-04 04:05:19.301287', 'step': 2226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:19.392275', 'step': 2226, 'epoch': 2} {'type': 'loss', 'content': 0.0016610038001090288, 'timestamp': '2025-09-04 04:05:19.409064', 'step': 2227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:05:19.485169', 'step': 2227, 'epoch': 2} {'type': 'loss', 'content': 0.02116265520453453, 'timestamp': '2025-09-04 04:05:19.499668', 'step': 2228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:05:19.601431', 'step': 2228, 'epoch': 2} {'type': 'loss', 'content': 0.006079908460378647, 'timestamp': '2025-09-04 04:05:19.621854', 'step': 2229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:05:19.731909', 'step': 2229, 'epoch': 2} {'type': 'loss', 'content': 0.0019000859465450048, 'timestamp': '2025-09-04 04:05:19.752516', 'step': 2230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:19.847153', 'step': 2230, 'epoch': 2} {'type': 'loss', 'content': 0.03679632768034935, 'timestamp': '2025-09-04 04:05:19.864479', 'step': 2231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:05:19.986974', 'step': 2231, 'epoch': 2} {'type': 'loss', 'content': 0.004042410757392645, 'timestamp': '2025-09-04 04:05:20.011031', 'step': 2232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:20.116757', 'step': 2232, 'epoch': 2} {'type': 'loss', 'content': 0.011536908335983753, 'timestamp': '2025-09-04 04:05:20.137776', 'step': 2233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 04:05:20.313896', 'step': 2233, 'epoch': 2} {'type': 'loss', 'content': 0.0038091272581368685, 'timestamp': '2025-09-04 04:05:20.346627', 'step': 2234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:20.452628', 'step': 2234, 'epoch': 2} {'type': 'loss', 'content': 0.002943043364211917, 'timestamp': '2025-09-04 04:05:20.472632', 'step': 2235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:20.579930', 'step': 2235, 'epoch': 2} {'type': 'loss', 'content': 0.009634853340685368, 'timestamp': '2025-09-04 04:05:20.600869', 'step': 2236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:20.690783', 'step': 2236, 'epoch': 2} {'type': 'loss', 'content': 0.022745907306671143, 'timestamp': '2025-09-04 04:05:20.709227', 'step': 2237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:20.818333', 'step': 2237, 'epoch': 2} {'type': 'loss', 'content': 0.003655359148979187, 'timestamp': '2025-09-04 04:05:20.838412', 'step': 2238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:05:20.930756', 'step': 2238, 'epoch': 2} {'type': 'loss', 'content': 0.016437901183962822, 'timestamp': '2025-09-04 04:05:20.946338', 'step': 2239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:21.053890', 'step': 2239, 'epoch': 2} {'type': 'loss', 'content': 0.01712539792060852, 'timestamp': '2025-09-04 04:05:21.074793', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:05:29.475438', 'step': 2240, 'epoch': 2} {'type': 'pplx', 'content': 320.6073536444029, 'timestamp': '2025-09-04 04:05:29.477692', 'step': 2240, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2240', 'timestamp': '2025-09-04 04:05:29.965605', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:30.067717', 'step': 2240, 'epoch': 2} {'type': 'loss', 'content': 0.003082787152379751, 'timestamp': '2025-09-04 04:05:30.089527', 'step': 2241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:05:30.294473', 'step': 2241, 'epoch': 2} {'type': 'loss', 'content': 0.06166600435972214, 'timestamp': '2025-09-04 04:05:30.333615', 'step': 2242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:05:30.412898', 'step': 2242, 'epoch': 2} {'type': 'loss', 'content': 0.007548161782324314, 'timestamp': '2025-09-04 04:05:30.427094', 'step': 2243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:05:30.544314', 'step': 2243, 'epoch': 2} {'type': 'loss', 'content': 0.026522303000092506, 'timestamp': '2025-09-04 04:05:30.567160', 'step': 2244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:05:30.675609', 'step': 2244, 'epoch': 2} {'type': 'loss', 'content': 0.04962928593158722, 'timestamp': '2025-09-04 04:05:30.697859', 'step': 2245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:30.789135', 'step': 2245, 'epoch': 2} {'type': 'loss', 'content': 0.001753210905008018, 'timestamp': '2025-09-04 04:05:30.805937', 'step': 2246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:05:30.891711', 'step': 2246, 'epoch': 2} {'type': 'loss', 'content': 0.0030653884168714285, 'timestamp': '2025-09-04 04:05:30.907210', 'step': 2247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:31.010986', 'step': 2247, 'epoch': 2} {'type': 'loss', 'content': 0.04342466592788696, 'timestamp': '2025-09-04 04:05:31.030783', 'step': 2248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:05:31.123299', 'step': 2248, 'epoch': 2} {'type': 'loss', 'content': 0.02544695883989334, 'timestamp': '2025-09-04 04:05:31.142550', 'step': 2249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 04:05:31.317706', 'step': 2249, 'epoch': 2} {'type': 'loss', 'content': 0.005824543070048094, 'timestamp': '2025-09-04 04:05:31.352320', 'step': 2250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:31.447443', 'step': 2250, 'epoch': 2} {'type': 'loss', 'content': 0.004376427736133337, 'timestamp': '2025-09-04 04:05:31.464644', 'step': 2251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:05:31.548999', 'step': 2251, 'epoch': 2} {'type': 'loss', 'content': 0.03957490622997284, 'timestamp': '2025-09-04 04:05:31.564806', 'step': 2252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:05:31.649806', 'step': 2252, 'epoch': 2} {'type': 'loss', 'content': 0.006183784920722246, 'timestamp': '2025-09-04 04:05:31.666917', 'step': 2253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:05:31.745329', 'step': 2253, 'epoch': 2} {'type': 'loss', 'content': 0.004066129215061665, 'timestamp': '2025-09-04 04:05:31.759132', 'step': 2254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:31.863127', 'step': 2254, 'epoch': 2} {'type': 'loss', 'content': 0.009590189903974533, 'timestamp': '2025-09-04 04:05:31.882110', 'step': 2255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:05:31.979430', 'step': 2255, 'epoch': 2} {'type': 'loss', 'content': 0.015844259411096573, 'timestamp': '2025-09-04 04:05:31.997578', 'step': 2256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:05:32.104647', 'step': 2256, 'epoch': 2} {'type': 'loss', 'content': 0.007126522250473499, 'timestamp': '2025-09-04 04:05:32.126836', 'step': 2257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:05:32.331721', 'step': 2257, 'epoch': 2} {'type': 'loss', 'content': 0.0030276242177933455, 'timestamp': '2025-09-04 04:05:32.370738', 'step': 2258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:32.476655', 'step': 2258, 'epoch': 2} {'type': 'loss', 'content': 0.01780683733522892, 'timestamp': '2025-09-04 04:05:32.495823', 'step': 2259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:05:32.596864', 'step': 2259, 'epoch': 2} {'type': 'loss', 'content': 0.0012856582179665565, 'timestamp': '2025-09-04 04:05:32.616120', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:05:41.117371', 'step': 2260, 'epoch': 2} {'type': 'pplx', 'content': 319.322025056002, 'timestamp': '2025-09-04 04:05:41.119565', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:05:41.234127', 'step': 2260, 'epoch': 2} {'type': 'loss', 'content': 0.10266165435314178, 'timestamp': '2025-09-04 04:05:41.257967', 'step': 2261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:41.361408', 'step': 2261, 'epoch': 2} {'type': 'loss', 'content': 0.013549219816923141, 'timestamp': '2025-09-04 04:05:41.380593', 'step': 2262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:41.482941', 'step': 2262, 'epoch': 2} {'type': 'loss', 'content': 0.0038013458251953125, 'timestamp': '2025-09-04 04:05:41.501959', 'step': 2263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:05:41.596910', 'step': 2263, 'epoch': 2} {'type': 'loss', 'content': 0.06860756129026413, 'timestamp': '2025-09-04 04:05:41.615157', 'step': 2264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:05:41.707053', 'step': 2264, 'epoch': 2} {'type': 'loss', 'content': 0.04129083827137947, 'timestamp': '2025-09-04 04:05:41.725972', 'step': 2265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:41.826374', 'step': 2265, 'epoch': 2} {'type': 'loss', 'content': 0.037230126559734344, 'timestamp': '2025-09-04 04:05:41.845042', 'step': 2266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:41.947523', 'step': 2266, 'epoch': 2} {'type': 'loss', 'content': 0.009212334640324116, 'timestamp': '2025-09-04 04:05:41.966655', 'step': 2267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:42.071971', 'step': 2267, 'epoch': 2} {'type': 'loss', 'content': 0.013861672952771187, 'timestamp': '2025-09-04 04:05:42.091888', 'step': 2268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:05:42.166045', 'step': 2268, 'epoch': 2} {'type': 'loss', 'content': 0.007805598899722099, 'timestamp': '2025-09-04 04:05:42.180729', 'step': 2269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:42.285975', 'step': 2269, 'epoch': 2} {'type': 'loss', 'content': 0.0023477617651224136, 'timestamp': '2025-09-04 04:05:42.305107', 'step': 2270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:42.405410', 'step': 2270, 'epoch': 2} {'type': 'loss', 'content': 0.0031649265438318253, 'timestamp': '2025-09-04 04:05:42.424225', 'step': 2271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:05:42.524728', 'step': 2271, 'epoch': 2} {'type': 'loss', 'content': 0.018990959972143173, 'timestamp': '2025-09-04 04:05:42.544057', 'step': 2272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:42.632585', 'step': 2272, 'epoch': 2} {'type': 'loss', 'content': 0.011629465036094189, 'timestamp': '2025-09-04 04:05:42.650953', 'step': 2273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:05:42.729086', 'step': 2273, 'epoch': 2} {'type': 'loss', 'content': 0.0017159185372292995, 'timestamp': '2025-09-04 04:05:42.743117', 'step': 2274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 04:05:42.957175', 'step': 2274, 'epoch': 2} {'type': 'loss', 'content': 0.012947708368301392, 'timestamp': '2025-09-04 04:05:42.997905', 'step': 2275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:05:43.092714', 'step': 2275, 'epoch': 2} {'type': 'loss', 'content': 0.03470579907298088, 'timestamp': '2025-09-04 04:05:43.110971', 'step': 2276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:05:43.187661', 'step': 2276, 'epoch': 2} {'type': 'loss', 'content': 0.005358322989195585, 'timestamp': '2025-09-04 04:05:43.202956', 'step': 2277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:43.306397', 'step': 2277, 'epoch': 2} {'type': 'loss', 'content': 0.008757129311561584, 'timestamp': '2025-09-04 04:05:43.325338', 'step': 2278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 04:05:43.453113', 'step': 2278, 'epoch': 2} {'type': 'loss', 'content': 0.001655671396292746, 'timestamp': '2025-09-04 04:05:43.477203', 'step': 2279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:43.581200', 'step': 2279, 'epoch': 2} {'type': 'loss', 'content': 0.003800937905907631, 'timestamp': '2025-09-04 04:05:43.600971', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:05:51.995079', 'step': 2280, 'epoch': 2} {'type': 'pplx', 'content': 310.31223764687115, 'timestamp': '2025-09-04 04:05:51.997642', 'step': 2280, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2280', 'timestamp': '2025-09-04 04:05:52.345100', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:05:52.420573', 'step': 2280, 'epoch': 2} {'type': 'loss', 'content': 0.0019430589163675904, 'timestamp': '2025-09-04 04:05:52.435958', 'step': 2281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:05:52.540294', 'step': 2281, 'epoch': 2} {'type': 'loss', 'content': 0.035052590072155, 'timestamp': '2025-09-04 04:05:52.559389', 'step': 2282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:52.661540', 'step': 2282, 'epoch': 2} {'type': 'loss', 'content': 0.026764625683426857, 'timestamp': '2025-09-04 04:05:52.680240', 'step': 2283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:05:52.759445', 'step': 2283, 'epoch': 2} {'type': 'loss', 'content': 0.010539744980633259, 'timestamp': '2025-09-04 04:05:52.774253', 'step': 2284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:52.872872', 'step': 2284, 'epoch': 2} {'type': 'loss', 'content': 0.03832215815782547, 'timestamp': '2025-09-04 04:05:52.893433', 'step': 2285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:05:53.002613', 'step': 2285, 'epoch': 2} {'type': 'loss', 'content': 0.005969279911369085, 'timestamp': '2025-09-04 04:05:53.022737', 'step': 2286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:53.115750', 'step': 2286, 'epoch': 2} {'type': 'loss', 'content': 0.011899287812411785, 'timestamp': '2025-09-04 04:05:53.132397', 'step': 2287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:05:53.220053', 'step': 2287, 'epoch': 2} {'type': 'loss', 'content': 0.01706940494477749, 'timestamp': '2025-09-04 04:05:53.236433', 'step': 2288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:05:53.325483', 'step': 2288, 'epoch': 2} {'type': 'loss', 'content': 0.010303936898708344, 'timestamp': '2025-09-04 04:05:53.343787', 'step': 2289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:05:53.420130', 'step': 2289, 'epoch': 2} {'type': 'loss', 'content': 0.0042968811467289925, 'timestamp': '2025-09-04 04:05:53.433846', 'step': 2290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:53.527246', 'step': 2290, 'epoch': 2} {'type': 'loss', 'content': 0.11020837724208832, 'timestamp': '2025-09-04 04:05:53.544270', 'step': 2291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:53.646159', 'step': 2291, 'epoch': 2} {'type': 'loss', 'content': 0.2083846777677536, 'timestamp': '2025-09-04 04:05:53.665376', 'step': 2292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:05:53.766145', 'step': 2292, 'epoch': 2} {'type': 'loss', 'content': 0.007627859245985746, 'timestamp': '2025-09-04 04:05:53.787078', 'step': 2293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:53.885519', 'step': 2293, 'epoch': 2} {'type': 'loss', 'content': 0.02781866304576397, 'timestamp': '2025-09-04 04:05:53.902604', 'step': 2294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:54.004580', 'step': 2294, 'epoch': 2} {'type': 'loss', 'content': 0.0024229728151112795, 'timestamp': '2025-09-04 04:05:54.023213', 'step': 2295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:05:54.131939', 'step': 2295, 'epoch': 2} {'type': 'loss', 'content': 0.005142164416611195, 'timestamp': '2025-09-04 04:05:54.152617', 'step': 2296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:54.252044', 'step': 2296, 'epoch': 2} {'type': 'loss', 'content': 0.004444428253918886, 'timestamp': '2025-09-04 04:05:54.272465', 'step': 2297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:05:54.366361', 'step': 2297, 'epoch': 2} {'type': 'loss', 'content': 0.008370397612452507, 'timestamp': '2025-09-04 04:05:54.383416', 'step': 2298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:05:54.484682', 'step': 2298, 'epoch': 2} {'type': 'loss', 'content': 0.020550237968564034, 'timestamp': '2025-09-04 04:05:54.503493', 'step': 2299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:05:54.614990', 'step': 2299, 'epoch': 2} {'type': 'loss', 'content': 0.027608707547187805, 'timestamp': '2025-09-04 04:05:54.635981', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:03.087262', 'step': 2300, 'epoch': 2} {'type': 'pplx', 'content': 302.3328329629629, 'timestamp': '2025-09-04 04:06:03.089701', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:06:03.163146', 'step': 2300, 'epoch': 2} {'type': 'loss', 'content': 0.01112450659275055, 'timestamp': '2025-09-04 04:06:03.177907', 'step': 2301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:03.280100', 'step': 2301, 'epoch': 2} {'type': 'loss', 'content': 0.007555335760116577, 'timestamp': '2025-09-04 04:06:03.298865', 'step': 2302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:06:03.384788', 'step': 2302, 'epoch': 2} {'type': 'loss', 'content': 0.024827582761645317, 'timestamp': '2025-09-04 04:06:03.400041', 'step': 2303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:06:03.493523', 'step': 2303, 'epoch': 2} {'type': 'loss', 'content': 0.006991108413785696, 'timestamp': '2025-09-04 04:06:03.511385', 'step': 2304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:03.603632', 'step': 2304, 'epoch': 2} {'type': 'loss', 'content': 0.0047009047120809555, 'timestamp': '2025-09-04 04:06:03.622610', 'step': 2305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:03.714249', 'step': 2305, 'epoch': 2} {'type': 'loss', 'content': 0.021796375513076782, 'timestamp': '2025-09-04 04:06:03.730911', 'step': 2306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:03.838853', 'step': 2306, 'epoch': 2} {'type': 'loss', 'content': 0.01497070025652647, 'timestamp': '2025-09-04 04:06:03.857863', 'step': 2307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:06:03.967870', 'step': 2307, 'epoch': 2} {'type': 'loss', 'content': 0.010811883956193924, 'timestamp': '2025-09-04 04:06:03.989029', 'step': 2308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:06:04.086437', 'step': 2308, 'epoch': 2} {'type': 'loss', 'content': 0.007137446664273739, 'timestamp': '2025-09-04 04:06:04.105083', 'step': 2309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:04.208371', 'step': 2309, 'epoch': 2} {'type': 'loss', 'content': 0.00834791362285614, 'timestamp': '2025-09-04 04:06:04.227423', 'step': 2310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:04.332171', 'step': 2310, 'epoch': 2} {'type': 'loss', 'content': 0.008670263923704624, 'timestamp': '2025-09-04 04:06:04.351382', 'step': 2311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:04.458108', 'step': 2311, 'epoch': 2} {'type': 'loss', 'content': 0.009029252454638481, 'timestamp': '2025-09-04 04:06:04.476256', 'step': 2312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:04.576767', 'step': 2312, 'epoch': 2} {'type': 'loss', 'content': 0.004705758765339851, 'timestamp': '2025-09-04 04:06:04.597508', 'step': 2313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:06:04.707664', 'step': 2313, 'epoch': 2} {'type': 'loss', 'content': 0.0046806796453893185, 'timestamp': '2025-09-04 04:06:04.727925', 'step': 2314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:04.831143', 'step': 2314, 'epoch': 2} {'type': 'loss', 'content': 0.03220272809267044, 'timestamp': '2025-09-04 04:06:04.850065', 'step': 2315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:06:04.935355', 'step': 2315, 'epoch': 2} {'type': 'loss', 'content': 0.013215369544923306, 'timestamp': '2025-09-04 04:06:04.951839', 'step': 2316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:06:05.033934', 'step': 2316, 'epoch': 2} {'type': 'loss', 'content': 0.03970480337738991, 'timestamp': '2025-09-04 04:06:05.050832', 'step': 2317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:05.140919', 'step': 2317, 'epoch': 2} {'type': 'loss', 'content': 0.0069844783283770084, 'timestamp': '2025-09-04 04:06:05.157670', 'step': 2318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:05.257832', 'step': 2318, 'epoch': 2} {'type': 'loss', 'content': 0.006692732684314251, 'timestamp': '2025-09-04 04:06:05.276581', 'step': 2319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:05.380384', 'step': 2319, 'epoch': 2} {'type': 'loss', 'content': 0.02646188624203205, 'timestamp': '2025-09-04 04:06:05.400003', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:13.772992', 'step': 2320, 'epoch': 2} {'type': 'pplx', 'content': 295.6536796467591, 'timestamp': '2025-09-04 04:06:13.775137', 'step': 2320, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2320', 'timestamp': '2025-09-04 04:06:14.126831', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:06:14.207180', 'step': 2320, 'epoch': 2} {'type': 'loss', 'content': 0.009197513572871685, 'timestamp': '2025-09-04 04:06:14.223926', 'step': 2321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:06:14.332576', 'step': 2321, 'epoch': 2} {'type': 'loss', 'content': 0.08209947496652603, 'timestamp': '2025-09-04 04:06:14.352709', 'step': 2322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:06:14.489174', 'step': 2322, 'epoch': 2} {'type': 'loss', 'content': 0.024091873317956924, 'timestamp': '2025-09-04 04:06:14.515467', 'step': 2323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:14.619946', 'step': 2323, 'epoch': 2} {'type': 'loss', 'content': 0.012969830073416233, 'timestamp': '2025-09-04 04:06:14.639869', 'step': 2324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:06:14.742599', 'step': 2324, 'epoch': 2} {'type': 'loss', 'content': 0.039127450436353683, 'timestamp': '2025-09-04 04:06:14.764493', 'step': 2325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:14.868050', 'step': 2325, 'epoch': 2} {'type': 'loss', 'content': 0.004493629559874535, 'timestamp': '2025-09-04 04:06:14.887300', 'step': 2326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:14.985953', 'step': 2326, 'epoch': 2} {'type': 'loss', 'content': 0.005800614599138498, 'timestamp': '2025-09-04 04:06:15.004519', 'step': 2327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:06:15.086700', 'step': 2327, 'epoch': 2} {'type': 'loss', 'content': 0.05111802741885185, 'timestamp': '2025-09-04 04:06:15.102670', 'step': 2328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:06:15.208703', 'step': 2328, 'epoch': 2} {'type': 'loss', 'content': 0.0008950461633503437, 'timestamp': '2025-09-04 04:06:15.231340', 'step': 2329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:06:15.325018', 'step': 2329, 'epoch': 2} {'type': 'loss', 'content': 0.0029820986092090607, 'timestamp': '2025-09-04 04:06:15.342129', 'step': 2330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:15.442186', 'step': 2330, 'epoch': 2} {'type': 'loss', 'content': 0.014196853153407574, 'timestamp': '2025-09-04 04:06:15.461081', 'step': 2331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 04:06:15.634661', 'step': 2331, 'epoch': 2} {'type': 'loss', 'content': 0.0035891346633434296, 'timestamp': '2025-09-04 04:06:15.668205', 'step': 2332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:06:15.785753', 'step': 2332, 'epoch': 2} {'type': 'loss', 'content': 0.0016923088114708662, 'timestamp': '2025-09-04 04:06:15.808308', 'step': 2333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:15.910974', 'step': 2333, 'epoch': 2} {'type': 'loss', 'content': 0.0015629819827154279, 'timestamp': '2025-09-04 04:06:15.930249', 'step': 2334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:06:16.007674', 'step': 2334, 'epoch': 2} {'type': 'loss', 'content': 0.023518525063991547, 'timestamp': '2025-09-04 04:06:16.021688', 'step': 2335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:16.124739', 'step': 2335, 'epoch': 2} {'type': 'loss', 'content': 0.0419270396232605, 'timestamp': '2025-09-04 04:06:16.144731', 'step': 2336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:16.245748', 'step': 2336, 'epoch': 2} {'type': 'loss', 'content': 0.005480342078953981, 'timestamp': '2025-09-04 04:06:16.266752', 'step': 2337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:06:16.377638', 'step': 2337, 'epoch': 2} {'type': 'loss', 'content': 0.024341512471437454, 'timestamp': '2025-09-04 04:06:16.398272', 'step': 2338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:16.499844', 'step': 2338, 'epoch': 2} {'type': 'loss', 'content': 0.017390906810760498, 'timestamp': '2025-09-04 04:06:16.518835', 'step': 2339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:16.619506', 'step': 2339, 'epoch': 2} {'type': 'loss', 'content': 0.024550272151827812, 'timestamp': '2025-09-04 04:06:16.639169', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:25.127410', 'step': 2340, 'epoch': 2} {'type': 'pplx', 'content': 295.22586601571055, 'timestamp': '2025-09-04 04:06:25.130299', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:06:25.206476', 'step': 2340, 'epoch': 2} {'type': 'loss', 'content': 0.003865597303956747, 'timestamp': '2025-09-04 04:06:25.221563', 'step': 2341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:25.325472', 'step': 2341, 'epoch': 2} {'type': 'loss', 'content': 0.023083921521902084, 'timestamp': '2025-09-04 04:06:25.344427', 'step': 2342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 04:06:25.474558', 'step': 2342, 'epoch': 2} {'type': 'loss', 'content': 0.0006907058414071798, 'timestamp': '2025-09-04 04:06:25.497944', 'step': 2343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:25.591178', 'step': 2343, 'epoch': 2} {'type': 'loss', 'content': 0.01256527565419674, 'timestamp': '2025-09-04 04:06:25.608450', 'step': 2344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1008], 'flops': 20160122450880.0}, 'timestamp': '2025-09-04 04:06:25.751962', 'step': 2344, 'epoch': 2} {'type': 'loss', 'content': 0.0006791522027924657, 'timestamp': '2025-09-04 04:06:25.782760', 'step': 2345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:25.888362', 'step': 2345, 'epoch': 2} {'type': 'loss', 'content': 0.006552206818014383, 'timestamp': '2025-09-04 04:06:25.907296', 'step': 2346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:06:26.018399', 'step': 2346, 'epoch': 2} {'type': 'loss', 'content': 0.009997514076530933, 'timestamp': '2025-09-04 04:06:26.038487', 'step': 2347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:26.138952', 'step': 2347, 'epoch': 2} {'type': 'loss', 'content': 0.028129128739237785, 'timestamp': '2025-09-04 04:06:26.158067', 'step': 2348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:26.260093', 'step': 2348, 'epoch': 2} {'type': 'loss', 'content': 0.0038631020579487085, 'timestamp': '2025-09-04 04:06:26.281059', 'step': 2349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:26.385515', 'step': 2349, 'epoch': 2} {'type': 'loss', 'content': 0.00890912301838398, 'timestamp': '2025-09-04 04:06:26.404418', 'step': 2350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:06:26.487101', 'step': 2350, 'epoch': 2} {'type': 'loss', 'content': 0.0030983267351984978, 'timestamp': '2025-09-04 04:06:26.500988', 'step': 2351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:26.592798', 'step': 2351, 'epoch': 2} {'type': 'loss', 'content': 0.04077935963869095, 'timestamp': '2025-09-04 04:06:26.610095', 'step': 2352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:06:26.726858', 'step': 2352, 'epoch': 2} {'type': 'loss', 'content': 0.0391593798995018, 'timestamp': '2025-09-04 04:06:26.750833', 'step': 2353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:26.846116', 'step': 2353, 'epoch': 2} {'type': 'loss', 'content': 0.0036758643109351397, 'timestamp': '2025-09-04 04:06:26.863271', 'step': 2354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:06:26.940414', 'step': 2354, 'epoch': 2} {'type': 'loss', 'content': 0.0077694314531981945, 'timestamp': '2025-09-04 04:06:26.953721', 'step': 2355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:27.057469', 'step': 2355, 'epoch': 2} {'type': 'loss', 'content': 0.035263847559690475, 'timestamp': '2025-09-04 04:06:27.077180', 'step': 2356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:27.176464', 'step': 2356, 'epoch': 2} {'type': 'loss', 'content': 0.0017548573669046164, 'timestamp': '2025-09-04 04:06:27.196622', 'step': 2357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:06:27.333584', 'step': 2357, 'epoch': 2} {'type': 'loss', 'content': 0.0036123136524111032, 'timestamp': '2025-09-04 04:06:27.359290', 'step': 2358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:27.462130', 'step': 2358, 'epoch': 2} {'type': 'loss', 'content': 0.026867439970374107, 'timestamp': '2025-09-04 04:06:27.480803', 'step': 2359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:06:27.571768', 'step': 2359, 'epoch': 2} {'type': 'loss', 'content': 0.011943703517317772, 'timestamp': '2025-09-04 04:06:27.587944', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:36.064337', 'step': 2360, 'epoch': 2} {'type': 'pplx', 'content': 298.06816808696493, 'timestamp': '2025-09-04 04:06:36.066522', 'step': 2360, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2360', 'timestamp': '2025-09-04 04:06:36.409656', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:36.509058', 'step': 2360, 'epoch': 2} {'type': 'loss', 'content': 0.01579303853213787, 'timestamp': '2025-09-04 04:06:36.530166', 'step': 2361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:06:36.608506', 'step': 2361, 'epoch': 2} {'type': 'loss', 'content': 0.0023343523498624563, 'timestamp': '2025-09-04 04:06:36.622543', 'step': 2362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:06:36.700242', 'step': 2362, 'epoch': 2} {'type': 'loss', 'content': 0.030650924891233444, 'timestamp': '2025-09-04 04:06:36.714227', 'step': 2363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:06:36.824118', 'step': 2363, 'epoch': 2} {'type': 'loss', 'content': 0.0008962932624854147, 'timestamp': '2025-09-04 04:06:36.845479', 'step': 2364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:06:36.922413', 'step': 2364, 'epoch': 2} {'type': 'loss', 'content': 0.008728813380002975, 'timestamp': '2025-09-04 04:06:36.937914', 'step': 2365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:37.041318', 'step': 2365, 'epoch': 2} {'type': 'loss', 'content': 0.0025963473599404097, 'timestamp': '2025-09-04 04:06:37.060611', 'step': 2366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:06:37.167037', 'step': 2366, 'epoch': 2} {'type': 'loss', 'content': 0.006974723190069199, 'timestamp': '2025-09-04 04:06:37.187013', 'step': 2367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:37.291474', 'step': 2367, 'epoch': 2} {'type': 'loss', 'content': 0.026150088757276535, 'timestamp': '2025-09-04 04:06:37.311513', 'step': 2368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:06:37.416575', 'step': 2368, 'epoch': 2} {'type': 'loss', 'content': 0.033544786274433136, 'timestamp': '2025-09-04 04:06:37.438548', 'step': 2369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:37.530032', 'step': 2369, 'epoch': 2} {'type': 'loss', 'content': 0.0012710822047665715, 'timestamp': '2025-09-04 04:06:37.546779', 'step': 2370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:37.640113', 'step': 2370, 'epoch': 2} {'type': 'loss', 'content': 0.004774425644427538, 'timestamp': '2025-09-04 04:06:37.657431', 'step': 2371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:37.748688', 'step': 2371, 'epoch': 2} {'type': 'loss', 'content': 0.010192320682108402, 'timestamp': '2025-09-04 04:06:37.766189', 'step': 2372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:06:37.857984', 'step': 2372, 'epoch': 2} {'type': 'loss', 'content': 0.047795332968235016, 'timestamp': '2025-09-04 04:06:37.877161', 'step': 2373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:06:37.960434', 'step': 2373, 'epoch': 2} {'type': 'loss', 'content': 0.018767505884170532, 'timestamp': '2025-09-04 04:06:37.975653', 'step': 2374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:38.073920', 'step': 2374, 'epoch': 2} {'type': 'loss', 'content': 0.017257632687687874, 'timestamp': '2025-09-04 04:06:38.092428', 'step': 2375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:06:38.194437', 'step': 2375, 'epoch': 2} {'type': 'loss', 'content': 0.007572493981570005, 'timestamp': '2025-09-04 04:06:38.214336', 'step': 2376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:38.313762', 'step': 2376, 'epoch': 2} {'type': 'loss', 'content': 0.0060096923261880875, 'timestamp': '2025-09-04 04:06:38.334427', 'step': 2377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:06:38.421155', 'step': 2377, 'epoch': 2} {'type': 'loss', 'content': 0.02415098063647747, 'timestamp': '2025-09-04 04:06:38.436761', 'step': 2378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:38.530537', 'step': 2378, 'epoch': 2} {'type': 'loss', 'content': 0.016814231872558594, 'timestamp': '2025-09-04 04:06:38.547930', 'step': 2379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:38.651752', 'step': 2379, 'epoch': 2} {'type': 'loss', 'content': 0.0010228022001683712, 'timestamp': '2025-09-04 04:06:38.671811', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:47.058540', 'step': 2380, 'epoch': 2} {'type': 'pplx', 'content': 300.2741385956129, 'timestamp': '2025-09-04 04:06:47.060601', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:47.156440', 'step': 2380, 'epoch': 2} {'type': 'loss', 'content': 0.05420851334929466, 'timestamp': '2025-09-04 04:06:47.177116', 'step': 2381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:06:47.282495', 'step': 2381, 'epoch': 2} {'type': 'loss', 'content': 0.013719238340854645, 'timestamp': '2025-09-04 04:06:47.302180', 'step': 2382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:47.405329', 'step': 2382, 'epoch': 2} {'type': 'loss', 'content': 0.01382434368133545, 'timestamp': '2025-09-04 04:06:47.424573', 'step': 2383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1424], 'flops': 28480172958272.0}, 'timestamp': '2025-09-04 04:06:47.635091', 'step': 2383, 'epoch': 2} {'type': 'loss', 'content': 0.017751427367329597, 'timestamp': '2025-09-04 04:06:47.676429', 'step': 2384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:47.773764', 'step': 2384, 'epoch': 2} {'type': 'loss', 'content': 0.026963358744978905, 'timestamp': '2025-09-04 04:06:47.794292', 'step': 2385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:47.884251', 'step': 2385, 'epoch': 2} {'type': 'loss', 'content': 0.0162715595215559, 'timestamp': '2025-09-04 04:06:47.901019', 'step': 2386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:47.994726', 'step': 2386, 'epoch': 2} {'type': 'loss', 'content': 0.009239349514245987, 'timestamp': '2025-09-04 04:06:48.012230', 'step': 2387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:48.114990', 'step': 2387, 'epoch': 2} {'type': 'loss', 'content': 0.02234196476638317, 'timestamp': '2025-09-04 04:06:48.135143', 'step': 2388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:06:48.216079', 'step': 2388, 'epoch': 2} {'type': 'loss', 'content': 0.031179826706647873, 'timestamp': '2025-09-04 04:06:48.232525', 'step': 2389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:06:48.340364', 'step': 2389, 'epoch': 2} {'type': 'loss', 'content': 0.025329116731882095, 'timestamp': '2025-09-04 04:06:48.360594', 'step': 2390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:48.461110', 'step': 2390, 'epoch': 2} {'type': 'loss', 'content': 0.022294968366622925, 'timestamp': '2025-09-04 04:06:48.480072', 'step': 2391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:06:48.576529', 'step': 2391, 'epoch': 2} {'type': 'loss', 'content': 0.0785910040140152, 'timestamp': '2025-09-04 04:06:48.594808', 'step': 2392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:06:48.668726', 'step': 2392, 'epoch': 2} {'type': 'loss', 'content': 0.024669643491506577, 'timestamp': '2025-09-04 04:06:48.683548', 'step': 2393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:06:48.782655', 'step': 2393, 'epoch': 2} {'type': 'loss', 'content': 0.023527929559350014, 'timestamp': '2025-09-04 04:06:48.801350', 'step': 2394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:06:48.917923', 'step': 2394, 'epoch': 2} {'type': 'loss', 'content': 0.01429518312215805, 'timestamp': '2025-09-04 04:06:48.940049', 'step': 2395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:49.043345', 'step': 2395, 'epoch': 2} {'type': 'loss', 'content': 0.003827124135568738, 'timestamp': '2025-09-04 04:06:49.063366', 'step': 2396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:06:49.204787', 'step': 2396, 'epoch': 2} {'type': 'loss', 'content': 0.010013996623456478, 'timestamp': '2025-09-04 04:06:49.235597', 'step': 2397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:06:49.343503', 'step': 2397, 'epoch': 2} {'type': 'loss', 'content': 0.011067106388509274, 'timestamp': '2025-09-04 04:06:49.363807', 'step': 2398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:06:49.460343', 'step': 2398, 'epoch': 2} {'type': 'loss', 'content': 0.000826965959277004, 'timestamp': '2025-09-04 04:06:49.477861', 'step': 2399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:06:49.572217', 'step': 2399, 'epoch': 2} {'type': 'loss', 'content': 0.03683660551905632, 'timestamp': '2025-09-04 04:06:49.590419', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:06:57.987370', 'step': 2400, 'epoch': 2} {'type': 'pplx', 'content': 299.87191027633986, 'timestamp': '2025-09-04 04:06:57.989481', 'step': 2400, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2400', 'timestamp': '2025-09-04 04:06:58.497954', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:58.598527', 'step': 2400, 'epoch': 2} {'type': 'loss', 'content': 0.0408879779279232, 'timestamp': '2025-09-04 04:06:58.619076', 'step': 2401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:06:58.763688', 'step': 2401, 'epoch': 2} {'type': 'loss', 'content': 0.07916318625211716, 'timestamp': '2025-09-04 04:06:58.783853', 'step': 2402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:06:58.913911', 'step': 2402, 'epoch': 2} {'type': 'loss', 'content': 0.009972754865884781, 'timestamp': '2025-09-04 04:06:58.933370', 'step': 2403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:06:59.098106', 'step': 2403, 'epoch': 2} {'type': 'loss', 'content': 0.005085110664367676, 'timestamp': '2025-09-04 04:06:59.120132', 'step': 2404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:06:59.303364', 'step': 2404, 'epoch': 2} {'type': 'loss', 'content': 0.006083235610276461, 'timestamp': '2025-09-04 04:06:59.325785', 'step': 2405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:06:59.433171', 'step': 2405, 'epoch': 2} {'type': 'loss', 'content': 0.018005046993494034, 'timestamp': '2025-09-04 04:06:59.450765', 'step': 2406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:06:59.565299', 'step': 2406, 'epoch': 2} {'type': 'loss', 'content': 0.0005014762282371521, 'timestamp': '2025-09-04 04:06:59.584616', 'step': 2407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:06:59.661630', 'step': 2407, 'epoch': 2} {'type': 'loss', 'content': 0.008235539309680462, 'timestamp': '2025-09-04 04:06:59.675910', 'step': 2408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:06:59.818065', 'step': 2408, 'epoch': 2} {'type': 'loss', 'content': 0.010737213306128979, 'timestamp': '2025-09-04 04:06:59.837867', 'step': 2409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:00.009283', 'step': 2409, 'epoch': 2} {'type': 'loss', 'content': 0.03348386660218239, 'timestamp': '2025-09-04 04:07:00.028112', 'step': 2410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:00.148227', 'step': 2410, 'epoch': 2} {'type': 'loss', 'content': 0.003203654196113348, 'timestamp': '2025-09-04 04:07:00.167426', 'step': 2411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:00.280154', 'step': 2411, 'epoch': 2} {'type': 'loss', 'content': 0.012616422958672047, 'timestamp': '2025-09-04 04:07:00.301563', 'step': 2412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:07:00.386966', 'step': 2412, 'epoch': 2} {'type': 'loss', 'content': 0.011091896332800388, 'timestamp': '2025-09-04 04:07:00.403586', 'step': 2413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 04:07:00.586895', 'step': 2413, 'epoch': 2} {'type': 'loss', 'content': 0.012619656510651112, 'timestamp': '2025-09-04 04:07:00.620884', 'step': 2414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:00.732799', 'step': 2414, 'epoch': 2} {'type': 'loss', 'content': 0.02890246920287609, 'timestamp': '2025-09-04 04:07:00.751815', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:00.890554', 'step': 2415, 'epoch': 2} {'type': 'loss', 'content': 0.023879971355199814, 'timestamp': '2025-09-04 04:07:00.911487', 'step': 2416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:01.035963', 'step': 2416, 'epoch': 2} {'type': 'loss', 'content': 0.009976472705602646, 'timestamp': '2025-09-04 04:07:01.056672', 'step': 2417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:07:01.187998', 'step': 2417, 'epoch': 2} {'type': 'loss', 'content': 0.038926564157009125, 'timestamp': '2025-09-04 04:07:01.201531', 'step': 2418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:07:01.314823', 'step': 2418, 'epoch': 2} {'type': 'loss', 'content': 0.01832580380141735, 'timestamp': '2025-09-04 04:07:01.332331', 'step': 2419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:01.421325', 'step': 2419, 'epoch': 2} {'type': 'loss', 'content': 0.00956509169191122, 'timestamp': '2025-09-04 04:07:01.437538', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:07:09.904505', 'step': 2420, 'epoch': 2} {'type': 'pplx', 'content': 303.06925253168487, 'timestamp': '2025-09-04 04:07:09.907076', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:07:09.979738', 'step': 2420, 'epoch': 2} {'type': 'loss', 'content': 0.037451133131980896, 'timestamp': '2025-09-04 04:07:09.994374', 'step': 2421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:07:10.089818', 'step': 2421, 'epoch': 2} {'type': 'loss', 'content': 0.013809522613883018, 'timestamp': '2025-09-04 04:07:10.107208', 'step': 2422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:10.209562', 'step': 2422, 'epoch': 2} {'type': 'loss', 'content': 0.03779786825180054, 'timestamp': '2025-09-04 04:07:10.228790', 'step': 2423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:10.316057', 'step': 2423, 'epoch': 2} {'type': 'loss', 'content': 0.013304756954312325, 'timestamp': '2025-09-04 04:07:10.332431', 'step': 2424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1232], 'flops': 24640149647168.0}, 'timestamp': '2025-09-04 04:07:10.511846', 'step': 2424, 'epoch': 2} {'type': 'loss', 'content': 0.04832831397652626, 'timestamp': '2025-09-04 04:07:10.549441', 'step': 2425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:10.636716', 'step': 2425, 'epoch': 2} {'type': 'loss', 'content': 0.00974750891327858, 'timestamp': '2025-09-04 04:07:10.652143', 'step': 2426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:10.756272', 'step': 2426, 'epoch': 2} {'type': 'loss', 'content': 0.04315938055515289, 'timestamp': '2025-09-04 04:07:10.775526', 'step': 2427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:10.874657', 'step': 2427, 'epoch': 2} {'type': 'loss', 'content': 0.00986342690885067, 'timestamp': '2025-09-04 04:07:10.894019', 'step': 2428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:07:11.028846', 'step': 2428, 'epoch': 2} {'type': 'loss', 'content': 0.0031775757670402527, 'timestamp': '2025-09-04 04:07:11.057582', 'step': 2429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:11.143954', 'step': 2429, 'epoch': 2} {'type': 'loss', 'content': 0.024582451209425926, 'timestamp': '2025-09-04 04:07:11.159488', 'step': 2430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:07:11.269272', 'step': 2430, 'epoch': 2} {'type': 'loss', 'content': 0.09256591647863388, 'timestamp': '2025-09-04 04:07:11.289807', 'step': 2431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:11.391712', 'step': 2431, 'epoch': 2} {'type': 'loss', 'content': 0.014577627182006836, 'timestamp': '2025-09-04 04:07:11.411644', 'step': 2432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:07:11.501022', 'step': 2432, 'epoch': 2} {'type': 'loss', 'content': 0.0009501639287918806, 'timestamp': '2025-09-04 04:07:11.519394', 'step': 2433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:11.622587', 'step': 2433, 'epoch': 2} {'type': 'loss', 'content': 0.022495364770293236, 'timestamp': '2025-09-04 04:07:11.641457', 'step': 2434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:07:11.718910', 'step': 2434, 'epoch': 2} {'type': 'loss', 'content': 0.013360848650336266, 'timestamp': '2025-09-04 04:07:11.732852', 'step': 2435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:11.818320', 'step': 2435, 'epoch': 2} {'type': 'loss', 'content': 0.006631570402532816, 'timestamp': '2025-09-04 04:07:11.834542', 'step': 2436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:11.933817', 'step': 2436, 'epoch': 2} {'type': 'loss', 'content': 0.004165567457675934, 'timestamp': '2025-09-04 04:07:11.954489', 'step': 2437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:12.059057', 'step': 2437, 'epoch': 2} {'type': 'loss', 'content': 0.002891003619879484, 'timestamp': '2025-09-04 04:07:12.078289', 'step': 2438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:12.177201', 'step': 2438, 'epoch': 2} {'type': 'loss', 'content': 0.00025852315593510866, 'timestamp': '2025-09-04 04:07:12.195858', 'step': 2439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 04:07:12.333048', 'step': 2439, 'epoch': 2} {'type': 'loss', 'content': 0.0613558404147625, 'timestamp': '2025-09-04 04:07:12.360058', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:07:20.744500', 'step': 2440, 'epoch': 2} {'type': 'pplx', 'content': 310.24100791067593, 'timestamp': '2025-09-04 04:07:20.746658', 'step': 2440, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2440', 'timestamp': '2025-09-04 04:07:21.256656', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:07:21.373924', 'step': 2440, 'epoch': 2} {'type': 'loss', 'content': 0.0030447612516582012, 'timestamp': '2025-09-04 04:07:21.399154', 'step': 2441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:07:21.602494', 'step': 2441, 'epoch': 2} {'type': 'loss', 'content': 0.028010720387101173, 'timestamp': '2025-09-04 04:07:21.641816', 'step': 2442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:21.751643', 'step': 2442, 'epoch': 2} {'type': 'loss', 'content': 0.00379212130792439, 'timestamp': '2025-09-04 04:07:21.772269', 'step': 2443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:07:21.866937', 'step': 2443, 'epoch': 2} {'type': 'loss', 'content': 0.0061768838204443455, 'timestamp': '2025-09-04 04:07:21.885100', 'step': 2444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:07:21.975206', 'step': 2444, 'epoch': 2} {'type': 'loss', 'content': 0.011033882386982441, 'timestamp': '2025-09-04 04:07:21.994027', 'step': 2445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:07:22.104069', 'step': 2445, 'epoch': 2} {'type': 'loss', 'content': 0.01739361509680748, 'timestamp': '2025-09-04 04:07:22.124669', 'step': 2446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:22.234369', 'step': 2446, 'epoch': 2} {'type': 'loss', 'content': 0.020273465663194656, 'timestamp': '2025-09-04 04:07:22.255184', 'step': 2447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:22.357414', 'step': 2447, 'epoch': 2} {'type': 'loss', 'content': 0.021363025531172752, 'timestamp': '2025-09-04 04:07:22.377554', 'step': 2448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:07:22.470270', 'step': 2448, 'epoch': 2} {'type': 'loss', 'content': 0.012352973222732544, 'timestamp': '2025-09-04 04:07:22.489610', 'step': 2449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:22.574849', 'step': 2449, 'epoch': 2} {'type': 'loss', 'content': 0.009310700930655003, 'timestamp': '2025-09-04 04:07:22.590465', 'step': 2450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:07:22.681235', 'step': 2450, 'epoch': 2} {'type': 'loss', 'content': 0.012666204944252968, 'timestamp': '2025-09-04 04:07:22.698051', 'step': 2451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:22.782113', 'step': 2451, 'epoch': 2} {'type': 'loss', 'content': 0.002646137960255146, 'timestamp': '2025-09-04 04:07:22.798362', 'step': 2452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:07:22.889307', 'step': 2452, 'epoch': 2} {'type': 'loss', 'content': 0.03072880022227764, 'timestamp': '2025-09-04 04:07:22.908543', 'step': 2453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:23.012169', 'step': 2453, 'epoch': 2} {'type': 'loss', 'content': 0.01461123675107956, 'timestamp': '2025-09-04 04:07:23.031423', 'step': 2454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:23.143359', 'step': 2454, 'epoch': 2} {'type': 'loss', 'content': 0.010309387929737568, 'timestamp': '2025-09-04 04:07:23.164090', 'step': 2455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:23.267195', 'step': 2455, 'epoch': 2} {'type': 'loss', 'content': 0.07364504784345627, 'timestamp': '2025-09-04 04:07:23.287265', 'step': 2456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:23.371287', 'step': 2456, 'epoch': 2} {'type': 'loss', 'content': 0.028103487566113472, 'timestamp': '2025-09-04 04:07:23.388274', 'step': 2457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:07:23.516938', 'step': 2457, 'epoch': 2} {'type': 'loss', 'content': 0.03834032267332077, 'timestamp': '2025-09-04 04:07:23.540149', 'step': 2458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:23.651140', 'step': 2458, 'epoch': 2} {'type': 'loss', 'content': 0.0072427773848176, 'timestamp': '2025-09-04 04:07:23.671797', 'step': 2459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:23.770961', 'step': 2459, 'epoch': 2} {'type': 'loss', 'content': 0.0034062564373016357, 'timestamp': '2025-09-04 04:07:23.790327', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:07:32.212762', 'step': 2460, 'epoch': 2} {'type': 'pplx', 'content': 315.36553546678925, 'timestamp': '2025-09-04 04:07:32.215085', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:07:32.320973', 'step': 2460, 'epoch': 2} {'type': 'loss', 'content': 0.0368022620677948, 'timestamp': '2025-09-04 04:07:32.343546', 'step': 2461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:32.446908', 'step': 2461, 'epoch': 2} {'type': 'loss', 'content': 0.014917043037712574, 'timestamp': '2025-09-04 04:07:32.466067', 'step': 2462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:07:32.543207', 'step': 2462, 'epoch': 2} {'type': 'loss', 'content': 0.07748901844024658, 'timestamp': '2025-09-04 04:07:32.557180', 'step': 2463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:32.661303', 'step': 2463, 'epoch': 2} {'type': 'loss', 'content': 0.006906221155077219, 'timestamp': '2025-09-04 04:07:32.681071', 'step': 2464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:32.777954', 'step': 2464, 'epoch': 2} {'type': 'loss', 'content': 0.03316435217857361, 'timestamp': '2025-09-04 04:07:32.798213', 'step': 2465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:07:32.892126', 'step': 2465, 'epoch': 2} {'type': 'loss', 'content': 0.0017650446388870478, 'timestamp': '2025-09-04 04:07:32.909510', 'step': 2466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:07:33.055191', 'step': 2466, 'epoch': 2} {'type': 'loss', 'content': 0.010998114012181759, 'timestamp': '2025-09-04 04:07:33.083267', 'step': 2467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:07:33.153289', 'step': 2467, 'epoch': 2} {'type': 'loss', 'content': 0.00827596615999937, 'timestamp': '2025-09-04 04:07:33.166670', 'step': 2468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:33.267008', 'step': 2468, 'epoch': 2} {'type': 'loss', 'content': 0.010986930690705776, 'timestamp': '2025-09-04 04:07:33.288172', 'step': 2469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:07:33.398361', 'step': 2469, 'epoch': 2} {'type': 'loss', 'content': 0.002616006415337324, 'timestamp': '2025-09-04 04:07:33.418487', 'step': 2470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 04:07:33.581804', 'step': 2470, 'epoch': 2} {'type': 'loss', 'content': 0.010370907373726368, 'timestamp': '2025-09-04 04:07:33.613941', 'step': 2471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:07:33.700684', 'step': 2471, 'epoch': 2} {'type': 'loss', 'content': 0.018909158185124397, 'timestamp': '2025-09-04 04:07:33.716679', 'step': 2472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:33.821067', 'step': 2472, 'epoch': 2} {'type': 'loss', 'content': 0.03325289860367775, 'timestamp': '2025-09-04 04:07:33.842023', 'step': 2473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:07:33.912109', 'step': 2473, 'epoch': 2} {'type': 'loss', 'content': 0.0038535459898412228, 'timestamp': '2025-09-04 04:07:33.924635', 'step': 2474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:34.035300', 'step': 2474, 'epoch': 2} {'type': 'loss', 'content': 0.03316061198711395, 'timestamp': '2025-09-04 04:07:34.055652', 'step': 2475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:34.148736', 'step': 2475, 'epoch': 2} {'type': 'loss', 'content': 0.042802825570106506, 'timestamp': '2025-09-04 04:07:34.164952', 'step': 2476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:34.265241', 'step': 2476, 'epoch': 2} {'type': 'loss', 'content': 0.013982797972857952, 'timestamp': '2025-09-04 04:07:34.286061', 'step': 2477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:34.388236', 'step': 2477, 'epoch': 2} {'type': 'loss', 'content': 0.00205409643240273, 'timestamp': '2025-09-04 04:07:34.406622', 'step': 2478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:07:34.489076', 'step': 2478, 'epoch': 2} {'type': 'loss', 'content': 0.0019991924054920673, 'timestamp': '2025-09-04 04:07:34.503233', 'step': 2479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:07:34.612716', 'step': 2479, 'epoch': 2} {'type': 'loss', 'content': 0.03043895773589611, 'timestamp': '2025-09-04 04:07:34.633505', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:07:43.012781', 'step': 2480, 'epoch': 2} {'type': 'pplx', 'content': 318.30207618622006, 'timestamp': '2025-09-04 04:07:43.014765', 'step': 2480, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2480', 'timestamp': '2025-09-04 04:07:43.373304', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:43.456246', 'step': 2480, 'epoch': 2} {'type': 'loss', 'content': 0.023624232038855553, 'timestamp': '2025-09-04 04:07:43.473367', 'step': 2481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:07:43.567295', 'step': 2481, 'epoch': 2} {'type': 'loss', 'content': 0.028039980679750443, 'timestamp': '2025-09-04 04:07:43.584815', 'step': 2482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:07:43.691965', 'step': 2482, 'epoch': 2} {'type': 'loss', 'content': 0.04251798987388611, 'timestamp': '2025-09-04 04:07:43.712226', 'step': 2483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:07:43.788241', 'step': 2483, 'epoch': 2} {'type': 'loss', 'content': 0.013368581421673298, 'timestamp': '2025-09-04 04:07:43.802722', 'step': 2484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:07:43.906725', 'step': 2484, 'epoch': 2} {'type': 'loss', 'content': 0.00481030810624361, 'timestamp': '2025-09-04 04:07:43.928625', 'step': 2485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:44.039992', 'step': 2485, 'epoch': 2} {'type': 'loss', 'content': 0.03684372827410698, 'timestamp': '2025-09-04 04:07:44.060654', 'step': 2486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:07:44.153521', 'step': 2486, 'epoch': 2} {'type': 'loss', 'content': 0.005444700364023447, 'timestamp': '2025-09-04 04:07:44.170668', 'step': 2487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:44.270187', 'step': 2487, 'epoch': 2} {'type': 'loss', 'content': 0.0024230824783444405, 'timestamp': '2025-09-04 04:07:44.289865', 'step': 2488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1088], 'flops': 21760132163840.0}, 'timestamp': '2025-09-04 04:07:44.443208', 'step': 2488, 'epoch': 2} {'type': 'loss', 'content': 0.026805497705936432, 'timestamp': '2025-09-04 04:07:44.476712', 'step': 2489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:44.579744', 'step': 2489, 'epoch': 2} {'type': 'loss', 'content': 0.02435290813446045, 'timestamp': '2025-09-04 04:07:44.599073', 'step': 2490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:07:44.692908', 'step': 2490, 'epoch': 2} {'type': 'loss', 'content': 0.0021042758598923683, 'timestamp': '2025-09-04 04:07:44.710316', 'step': 2491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:07:44.819017', 'step': 2491, 'epoch': 2} {'type': 'loss', 'content': 0.005590126849710941, 'timestamp': '2025-09-04 04:07:44.840113', 'step': 2492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:07:44.923546', 'step': 2492, 'epoch': 2} {'type': 'loss', 'content': 0.008740575052797794, 'timestamp': '2025-09-04 04:07:44.940517', 'step': 2493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:45.026735', 'step': 2493, 'epoch': 2} {'type': 'loss', 'content': 0.024248680099844933, 'timestamp': '2025-09-04 04:07:45.042313', 'step': 2494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:45.142261', 'step': 2494, 'epoch': 2} {'type': 'loss', 'content': 0.028292085975408554, 'timestamp': '2025-09-04 04:07:45.161082', 'step': 2495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:07:45.255265', 'step': 2495, 'epoch': 2} {'type': 'loss', 'content': 0.014871833845973015, 'timestamp': '2025-09-04 04:07:45.273507', 'step': 2496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:45.374285', 'step': 2496, 'epoch': 2} {'type': 'loss', 'content': 0.017097758129239082, 'timestamp': '2025-09-04 04:07:45.395462', 'step': 2497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:07:45.515770', 'step': 2497, 'epoch': 2} {'type': 'loss', 'content': 0.014303861185908318, 'timestamp': '2025-09-04 04:07:45.537472', 'step': 2498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:07:45.628732', 'step': 2498, 'epoch': 2} {'type': 'loss', 'content': 0.0032022215891629457, 'timestamp': '2025-09-04 04:07:45.645560', 'step': 2499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:07:45.740982', 'step': 2499, 'epoch': 2} {'type': 'loss', 'content': 0.0452834852039814, 'timestamp': '2025-09-04 04:07:45.759178', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:07:54.128547', 'step': 2500, 'epoch': 2} {'type': 'pplx', 'content': 319.347960568569, 'timestamp': '2025-09-04 04:07:54.130884', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 04:07:54.190519', 'step': 2500, 'epoch': 2} {'type': 'loss', 'content': 0.007545833010226488, 'timestamp': '2025-09-04 04:07:54.202250', 'step': 2501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:07:54.284283', 'step': 2501, 'epoch': 2} {'type': 'loss', 'content': 0.008382921107113361, 'timestamp': '2025-09-04 04:07:54.299354', 'step': 2502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 04:07:54.428234', 'step': 2502, 'epoch': 2} {'type': 'loss', 'content': 0.010973788797855377, 'timestamp': '2025-09-04 04:07:54.452789', 'step': 2503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:54.538660', 'step': 2503, 'epoch': 2} {'type': 'loss', 'content': 0.04710651561617851, 'timestamp': '2025-09-04 04:07:54.555139', 'step': 2504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:54.661887', 'step': 2504, 'epoch': 2} {'type': 'loss', 'content': 0.00142197054810822, 'timestamp': '2025-09-04 04:07:54.684651', 'step': 2505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:07:54.794898', 'step': 2505, 'epoch': 2} {'type': 'loss', 'content': 0.04192418232560158, 'timestamp': '2025-09-04 04:07:54.815339', 'step': 2506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:54.918276', 'step': 2506, 'epoch': 2} {'type': 'loss', 'content': 0.024896910414099693, 'timestamp': '2025-09-04 04:07:54.937467', 'step': 2507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:55.038731', 'step': 2507, 'epoch': 2} {'type': 'loss', 'content': 0.02149958163499832, 'timestamp': '2025-09-04 04:07:55.058382', 'step': 2508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:07:55.145206', 'step': 2508, 'epoch': 2} {'type': 'loss', 'content': 0.012380536645650864, 'timestamp': '2025-09-04 04:07:55.160522', 'step': 2509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:07:55.270997', 'step': 2509, 'epoch': 2} {'type': 'loss', 'content': 0.0016803938196972013, 'timestamp': '2025-09-04 04:07:55.291308', 'step': 2510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:07:55.396245', 'step': 2510, 'epoch': 2} {'type': 'loss', 'content': 0.015394407324492931, 'timestamp': '2025-09-04 04:07:55.415316', 'step': 2511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:55.518957', 'step': 2511, 'epoch': 2} {'type': 'loss', 'content': 0.005977288819849491, 'timestamp': '2025-09-04 04:07:55.538996', 'step': 2512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:07:55.626412', 'step': 2512, 'epoch': 2} {'type': 'loss', 'content': 0.01742619276046753, 'timestamp': '2025-09-04 04:07:55.644714', 'step': 2513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:55.747687', 'step': 2513, 'epoch': 2} {'type': 'loss', 'content': 0.006442517042160034, 'timestamp': '2025-09-04 04:07:55.767073', 'step': 2514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:55.866978', 'step': 2514, 'epoch': 2} {'type': 'loss', 'content': 0.013937929645180702, 'timestamp': '2025-09-04 04:07:55.885738', 'step': 2515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:07:55.987972', 'step': 2515, 'epoch': 2} {'type': 'loss', 'content': 0.005017734598368406, 'timestamp': '2025-09-04 04:07:56.007994', 'step': 2516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:07:56.125464', 'step': 2516, 'epoch': 2} {'type': 'loss', 'content': 0.002909827046096325, 'timestamp': '2025-09-04 04:07:56.150788', 'step': 2517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:07:56.236953', 'step': 2517, 'epoch': 2} {'type': 'loss', 'content': 0.0124394865706563, 'timestamp': '2025-09-04 04:07:56.252576', 'step': 2518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:07:56.352113', 'step': 2518, 'epoch': 2} {'type': 'loss', 'content': 0.00413033552467823, 'timestamp': '2025-09-04 04:07:56.370492', 'step': 2519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:07:56.470526', 'step': 2519, 'epoch': 2} {'type': 'loss', 'content': 0.044936034828424454, 'timestamp': '2025-09-04 04:07:56.490181', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:04.841099', 'step': 2520, 'epoch': 2} {'type': 'pplx', 'content': 321.3946038942604, 'timestamp': '2025-09-04 04:08:04.843128', 'step': 2520, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2520', 'timestamp': '2025-09-04 04:08:05.181843', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:05.281078', 'step': 2520, 'epoch': 2} {'type': 'loss', 'content': 0.004612304270267487, 'timestamp': '2025-09-04 04:08:05.301969', 'step': 2521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:05.397001', 'step': 2521, 'epoch': 2} {'type': 'loss', 'content': 0.028999704867601395, 'timestamp': '2025-09-04 04:08:05.414484', 'step': 2522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:05.518293', 'step': 2522, 'epoch': 2} {'type': 'loss', 'content': 0.006988754495978355, 'timestamp': '2025-09-04 04:08:05.537400', 'step': 2523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:05.636988', 'step': 2523, 'epoch': 2} {'type': 'loss', 'content': 0.07554414868354797, 'timestamp': '2025-09-04 04:08:05.656374', 'step': 2524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:05.756212', 'step': 2524, 'epoch': 2} {'type': 'loss', 'content': 0.004102020058780909, 'timestamp': '2025-09-04 04:08:05.776850', 'step': 2525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:05.885428', 'step': 2525, 'epoch': 2} {'type': 'loss', 'content': 0.0004931488656438887, 'timestamp': '2025-09-04 04:08:05.904557', 'step': 2526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:06.012440', 'step': 2526, 'epoch': 2} {'type': 'loss', 'content': 0.02074488066136837, 'timestamp': '2025-09-04 04:08:06.032722', 'step': 2527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:08:06.148947', 'step': 2527, 'epoch': 2} {'type': 'loss', 'content': 0.00027644523652270436, 'timestamp': '2025-09-04 04:08:06.171613', 'step': 2528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:06.263638', 'step': 2528, 'epoch': 2} {'type': 'loss', 'content': 0.0038577071391046047, 'timestamp': '2025-09-04 04:08:06.282745', 'step': 2529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:08:06.360604', 'step': 2529, 'epoch': 2} {'type': 'loss', 'content': 0.026262257248163223, 'timestamp': '2025-09-04 04:08:06.374738', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:08:06.481073', 'step': 2530, 'epoch': 2} {'type': 'loss', 'content': 0.01689167320728302, 'timestamp': '2025-09-04 04:08:06.500842', 'step': 2531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:06.603402', 'step': 2531, 'epoch': 2} {'type': 'loss', 'content': 0.04883643984794617, 'timestamp': '2025-09-04 04:08:06.623391', 'step': 2532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:08:06.706852', 'step': 2532, 'epoch': 2} {'type': 'loss', 'content': 0.005458258092403412, 'timestamp': '2025-09-04 04:08:06.723480', 'step': 2533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:08:06.840181', 'step': 2533, 'epoch': 2} {'type': 'loss', 'content': 0.01368082407861948, 'timestamp': '2025-09-04 04:08:06.862251', 'step': 2534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:08:06.938420', 'step': 2534, 'epoch': 2} {'type': 'loss', 'content': 0.008800865150988102, 'timestamp': '2025-09-04 04:08:06.952160', 'step': 2535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:08:07.058634', 'step': 2535, 'epoch': 2} {'type': 'loss', 'content': 0.0017213658429682255, 'timestamp': '2025-09-04 04:08:07.079073', 'step': 2536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:08:07.187701', 'step': 2536, 'epoch': 2} {'type': 'loss', 'content': 0.024919630959630013, 'timestamp': '2025-09-04 04:08:07.210278', 'step': 2537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:07.303713', 'step': 2537, 'epoch': 2} {'type': 'loss', 'content': 0.005876360926777124, 'timestamp': '2025-09-04 04:08:07.320835', 'step': 2538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:08:07.405122', 'step': 2538, 'epoch': 2} {'type': 'loss', 'content': 0.06584823131561279, 'timestamp': '2025-09-04 04:08:07.420373', 'step': 2539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:07.513278', 'step': 2539, 'epoch': 2} {'type': 'loss', 'content': 0.01981205679476261, 'timestamp': '2025-09-04 04:08:07.531141', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:15.901844', 'step': 2540, 'epoch': 2} {'type': 'pplx', 'content': 322.01266845314177, 'timestamp': '2025-09-04 04:08:15.903878', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:08:16.008572', 'step': 2540, 'epoch': 2} {'type': 'loss', 'content': 0.0399375818669796, 'timestamp': '2025-09-04 04:08:16.031089', 'step': 2541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:08:16.140199', 'step': 2541, 'epoch': 2} {'type': 'loss', 'content': 0.004767777398228645, 'timestamp': '2025-09-04 04:08:16.160739', 'step': 2542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:16.256471', 'step': 2542, 'epoch': 2} {'type': 'loss', 'content': 0.02117767557501793, 'timestamp': '2025-09-04 04:08:16.273940', 'step': 2543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:08:16.344595', 'step': 2543, 'epoch': 2} {'type': 'loss', 'content': 0.010128835216164589, 'timestamp': '2025-09-04 04:08:16.358124', 'step': 2544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:08:16.439671', 'step': 2544, 'epoch': 2} {'type': 'loss', 'content': 0.0018931415397673845, 'timestamp': '2025-09-04 04:08:16.456305', 'step': 2545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:16.555368', 'step': 2545, 'epoch': 2} {'type': 'loss', 'content': 0.011107025668025017, 'timestamp': '2025-09-04 04:08:16.573896', 'step': 2546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:16.674332', 'step': 2546, 'epoch': 2} {'type': 'loss', 'content': 0.022087909281253815, 'timestamp': '2025-09-04 04:08:16.693190', 'step': 2547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:16.796126', 'step': 2547, 'epoch': 2} {'type': 'loss', 'content': 0.04829714074730873, 'timestamp': '2025-09-04 04:08:16.816098', 'step': 2548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:16.916158', 'step': 2548, 'epoch': 2} {'type': 'loss', 'content': 0.009213853627443314, 'timestamp': '2025-09-04 04:08:16.936557', 'step': 2549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:08:17.043596', 'step': 2549, 'epoch': 2} {'type': 'loss', 'content': 0.029857391491532326, 'timestamp': '2025-09-04 04:08:17.063569', 'step': 2550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:17.167443', 'step': 2550, 'epoch': 2} {'type': 'loss', 'content': 0.004018782638013363, 'timestamp': '2025-09-04 04:08:17.186721', 'step': 2551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:08:17.261179', 'step': 2551, 'epoch': 2} {'type': 'loss', 'content': 0.029023800045251846, 'timestamp': '2025-09-04 04:08:17.275555', 'step': 2552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:17.366789', 'step': 2552, 'epoch': 2} {'type': 'loss', 'content': 0.026715071871876717, 'timestamp': '2025-09-04 04:08:17.388002', 'step': 2553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:08:17.540389', 'step': 2553, 'epoch': 2} {'type': 'loss', 'content': 0.048667825758457184, 'timestamp': '2025-09-04 04:08:17.560427', 'step': 2554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:08:17.700200', 'step': 2554, 'epoch': 2} {'type': 'loss', 'content': 0.02002848871052265, 'timestamp': '2025-09-04 04:08:17.715659', 'step': 2555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:17.829153', 'step': 2555, 'epoch': 2} {'type': 'loss', 'content': 0.015101251192390919, 'timestamp': '2025-09-04 04:08:17.847046', 'step': 2556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:08:17.937680', 'step': 2556, 'epoch': 2} {'type': 'loss', 'content': 0.001839878037571907, 'timestamp': '2025-09-04 04:08:17.956055', 'step': 2557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:08:18.033760', 'step': 2557, 'epoch': 2} {'type': 'loss', 'content': 0.0144809540361166, 'timestamp': '2025-09-04 04:08:18.047547', 'step': 2558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:18.152111', 'step': 2558, 'epoch': 2} {'type': 'loss', 'content': 0.002240030327811837, 'timestamp': '2025-09-04 04:08:18.171263', 'step': 2559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:18.273966', 'step': 2559, 'epoch': 2} {'type': 'loss', 'content': 0.006072205025702715, 'timestamp': '2025-09-04 04:08:18.293971', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:26.673676', 'step': 2560, 'epoch': 2} {'type': 'pplx', 'content': 321.77623584602946, 'timestamp': '2025-09-04 04:08:26.675704', 'step': 2560, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2560', 'timestamp': '2025-09-04 04:08:27.019902', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:08:27.100761', 'step': 2560, 'epoch': 2} {'type': 'loss', 'content': 0.025917798280715942, 'timestamp': '2025-09-04 04:08:27.117323', 'step': 2561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:27.225582', 'step': 2561, 'epoch': 2} {'type': 'loss', 'content': 0.025179943069815636, 'timestamp': '2025-09-04 04:08:27.245639', 'step': 2562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:27.347057', 'step': 2562, 'epoch': 2} {'type': 'loss', 'content': 0.014063333161175251, 'timestamp': '2025-09-04 04:08:27.366276', 'step': 2563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:27.460070', 'step': 2563, 'epoch': 2} {'type': 'loss', 'content': 0.0029370649717748165, 'timestamp': '2025-09-04 04:08:27.477708', 'step': 2564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:27.582741', 'step': 2564, 'epoch': 2} {'type': 'loss', 'content': 0.014166107401251793, 'timestamp': '2025-09-04 04:08:27.604754', 'step': 2565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:08:27.715409', 'step': 2565, 'epoch': 2} {'type': 'loss', 'content': 0.06013331934809685, 'timestamp': '2025-09-04 04:08:27.736200', 'step': 2566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:08:27.881746', 'step': 2566, 'epoch': 2} {'type': 'loss', 'content': 0.008028823882341385, 'timestamp': '2025-09-04 04:08:27.909876', 'step': 2567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:28.010271', 'step': 2567, 'epoch': 2} {'type': 'loss', 'content': 0.04439549893140793, 'timestamp': '2025-09-04 04:08:28.029939', 'step': 2568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:28.120319', 'step': 2568, 'epoch': 2} {'type': 'loss', 'content': 0.001983765745535493, 'timestamp': '2025-09-04 04:08:28.139120', 'step': 2569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:08:28.222574', 'step': 2569, 'epoch': 2} {'type': 'loss', 'content': 0.13214930891990662, 'timestamp': '2025-09-04 04:08:28.237917', 'step': 2570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:28.341573', 'step': 2570, 'epoch': 2} {'type': 'loss', 'content': 0.02340877056121826, 'timestamp': '2025-09-04 04:08:28.360697', 'step': 2571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:08:28.437650', 'step': 2571, 'epoch': 2} {'type': 'loss', 'content': 0.03975323215126991, 'timestamp': '2025-09-04 04:08:28.452202', 'step': 2572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:08:28.534148', 'step': 2572, 'epoch': 2} {'type': 'loss', 'content': 0.013518854975700378, 'timestamp': '2025-09-04 04:08:28.551256', 'step': 2573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:28.660249', 'step': 2573, 'epoch': 2} {'type': 'loss', 'content': 0.039588313549757004, 'timestamp': '2025-09-04 04:08:28.680318', 'step': 2574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:08:28.788512', 'step': 2574, 'epoch': 2} {'type': 'loss', 'content': 0.002232232363894582, 'timestamp': '2025-09-04 04:08:28.809195', 'step': 2575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:28.901049', 'step': 2575, 'epoch': 2} {'type': 'loss', 'content': 0.015773437917232513, 'timestamp': '2025-09-04 04:08:28.919005', 'step': 2576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:29.009843', 'step': 2576, 'epoch': 2} {'type': 'loss', 'content': 0.01339112501591444, 'timestamp': '2025-09-04 04:08:29.028845', 'step': 2577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:29.130668', 'step': 2577, 'epoch': 2} {'type': 'loss', 'content': 0.016942497342824936, 'timestamp': '2025-09-04 04:08:29.149712', 'step': 2578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:08:29.223347', 'step': 2578, 'epoch': 2} {'type': 'loss', 'content': 0.024486076086759567, 'timestamp': '2025-09-04 04:08:29.237004', 'step': 2579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:08:29.320887', 'step': 2579, 'epoch': 2} {'type': 'loss', 'content': 0.03973688185214996, 'timestamp': '2025-09-04 04:08:29.337239', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:37.715930', 'step': 2580, 'epoch': 2} {'type': 'pplx', 'content': 317.32499137431523, 'timestamp': '2025-09-04 04:08:37.718369', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:37.815580', 'step': 2580, 'epoch': 2} {'type': 'loss', 'content': 0.004058394581079483, 'timestamp': '2025-09-04 04:08:37.836637', 'step': 2581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:08:37.923125', 'step': 2581, 'epoch': 2} {'type': 'loss', 'content': 0.006692761089652777, 'timestamp': '2025-09-04 04:08:37.938509', 'step': 2582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:38.034616', 'step': 2582, 'epoch': 2} {'type': 'loss', 'content': 0.01003364846110344, 'timestamp': '2025-09-04 04:08:38.052107', 'step': 2583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:38.147127', 'step': 2583, 'epoch': 2} {'type': 'loss', 'content': 0.0015831181081011891, 'timestamp': '2025-09-04 04:08:38.165025', 'step': 2584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:38.257632', 'step': 2584, 'epoch': 2} {'type': 'loss', 'content': 0.02085372433066368, 'timestamp': '2025-09-04 04:08:38.276733', 'step': 2585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:38.379441', 'step': 2585, 'epoch': 2} {'type': 'loss', 'content': 0.012194113805890083, 'timestamp': '2025-09-04 04:08:38.398062', 'step': 2586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:38.497361', 'step': 2586, 'epoch': 2} {'type': 'loss', 'content': 0.020145049318671227, 'timestamp': '2025-09-04 04:08:38.515933', 'step': 2587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:38.611335', 'step': 2587, 'epoch': 2} {'type': 'loss', 'content': 0.01815951056778431, 'timestamp': '2025-09-04 04:08:38.629608', 'step': 2588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:08:38.737350', 'step': 2588, 'epoch': 2} {'type': 'loss', 'content': 0.0061141857877373695, 'timestamp': '2025-09-04 04:08:38.760046', 'step': 2589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:08:38.843486', 'step': 2589, 'epoch': 2} {'type': 'loss', 'content': 0.04689551517367363, 'timestamp': '2025-09-04 04:08:38.858515', 'step': 2590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:38.962843', 'step': 2590, 'epoch': 2} {'type': 'loss', 'content': 0.005039518233388662, 'timestamp': '2025-09-04 04:08:38.982095', 'step': 2591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:39.083944', 'step': 2591, 'epoch': 2} {'type': 'loss', 'content': 0.003636001143604517, 'timestamp': '2025-09-04 04:08:39.103553', 'step': 2592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:39.194471', 'step': 2592, 'epoch': 2} {'type': 'loss', 'content': 0.0033800839446485043, 'timestamp': '2025-09-04 04:08:39.213259', 'step': 2593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:08:39.323734', 'step': 2593, 'epoch': 2} {'type': 'loss', 'content': 0.01337014976888895, 'timestamp': '2025-09-04 04:08:39.344344', 'step': 2594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:39.445586', 'step': 2594, 'epoch': 2} {'type': 'loss', 'content': 0.003958144225180149, 'timestamp': '2025-09-04 04:08:39.464458', 'step': 2595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 04:08:39.598304', 'step': 2595, 'epoch': 2} {'type': 'loss', 'content': 0.004944264888763428, 'timestamp': '2025-09-04 04:08:39.623718', 'step': 2596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:39.715344', 'step': 2596, 'epoch': 2} {'type': 'loss', 'content': 0.012598078697919846, 'timestamp': '2025-09-04 04:08:39.734053', 'step': 2597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:08:39.811419', 'step': 2597, 'epoch': 2} {'type': 'loss', 'content': 0.010607503354549408, 'timestamp': '2025-09-04 04:08:39.825419', 'step': 2598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:08:39.918935', 'step': 2598, 'epoch': 2} {'type': 'loss', 'content': 0.05367982015013695, 'timestamp': '2025-09-04 04:08:39.936039', 'step': 2599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:08:40.045928', 'step': 2599, 'epoch': 2} {'type': 'loss', 'content': 0.0185086727142334, 'timestamp': '2025-09-04 04:08:40.067205', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:48.439146', 'step': 2600, 'epoch': 2} {'type': 'pplx', 'content': 314.9293203900891, 'timestamp': '2025-09-04 04:08:48.441562', 'step': 2600, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2600', 'timestamp': '2025-09-04 04:08:48.943661', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:08:49.017512', 'step': 2600, 'epoch': 3} {'type': 'loss', 'content': 0.002318183658644557, 'timestamp': '2025-09-04 04:08:49.032219', 'step': 2601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:49.134643', 'step': 2601, 'epoch': 3} {'type': 'loss', 'content': 0.02105123922228813, 'timestamp': '2025-09-04 04:08:49.153886', 'step': 2602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:08:49.255669', 'step': 2602, 'epoch': 3} {'type': 'loss', 'content': 0.02376515232026577, 'timestamp': '2025-09-04 04:08:49.274554', 'step': 2603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:08:49.378640', 'step': 2603, 'epoch': 3} {'type': 'loss', 'content': 0.027340862900018692, 'timestamp': '2025-09-04 04:08:49.398723', 'step': 2604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:49.490403', 'step': 2604, 'epoch': 3} {'type': 'loss', 'content': 0.007925122044980526, 'timestamp': '2025-09-04 04:08:49.509557', 'step': 2605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:49.603779', 'step': 2605, 'epoch': 3} {'type': 'loss', 'content': 0.006013544742017984, 'timestamp': '2025-09-04 04:08:49.621155', 'step': 2606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:49.733300', 'step': 2606, 'epoch': 3} {'type': 'loss', 'content': 0.011563356965780258, 'timestamp': '2025-09-04 04:08:49.753611', 'step': 2607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:08:49.856580', 'step': 2607, 'epoch': 3} {'type': 'loss', 'content': 0.006092959549278021, 'timestamp': '2025-09-04 04:08:49.876556', 'step': 2608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:08:49.980849', 'step': 2608, 'epoch': 3} {'type': 'loss', 'content': 0.01699855737388134, 'timestamp': '2025-09-04 04:08:50.002771', 'step': 2609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:08:50.097356', 'step': 2609, 'epoch': 3} {'type': 'loss', 'content': 0.002047803020104766, 'timestamp': '2025-09-04 04:08:50.114758', 'step': 2610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:08:50.187652', 'step': 2610, 'epoch': 3} {'type': 'loss', 'content': 0.028143590316176414, 'timestamp': '2025-09-04 04:08:50.200565', 'step': 2611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:50.295727', 'step': 2611, 'epoch': 3} {'type': 'loss', 'content': 0.0071517350152134895, 'timestamp': '2025-09-04 04:08:50.313984', 'step': 2612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:08:50.410801', 'step': 2612, 'epoch': 3} {'type': 'loss', 'content': 0.01916099339723587, 'timestamp': '2025-09-04 04:08:50.431216', 'step': 2613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:08:50.518259', 'step': 2613, 'epoch': 3} {'type': 'loss', 'content': 0.006362794432789087, 'timestamp': '2025-09-04 04:08:50.533877', 'step': 2614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:50.643528', 'step': 2614, 'epoch': 3} {'type': 'loss', 'content': 0.006359405815601349, 'timestamp': '2025-09-04 04:08:50.663843', 'step': 2615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:08:50.750539', 'step': 2615, 'epoch': 3} {'type': 'loss', 'content': 0.0024103245232254267, 'timestamp': '2025-09-04 04:08:50.766946', 'step': 2616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:08:50.859755', 'step': 2616, 'epoch': 3} {'type': 'loss', 'content': 0.007830057293176651, 'timestamp': '2025-09-04 04:08:50.878988', 'step': 2617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:08:50.956543', 'step': 2617, 'epoch': 3} {'type': 'loss', 'content': 0.05507553741335869, 'timestamp': '2025-09-04 04:08:50.970499', 'step': 2618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:08:51.042830', 'step': 2618, 'epoch': 3} {'type': 'loss', 'content': 0.007564071100205183, 'timestamp': '2025-09-04 04:08:51.055769', 'step': 2619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:08:51.165873', 'step': 2619, 'epoch': 3} {'type': 'loss', 'content': 0.0021915908437222242, 'timestamp': '2025-09-04 04:08:51.187298', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:08:59.590829', 'step': 2620, 'epoch': 3} {'type': 'pplx', 'content': 315.285162165603, 'timestamp': '2025-09-04 04:08:59.592803', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:08:59.696649', 'step': 2620, 'epoch': 3} {'type': 'loss', 'content': 0.0060343146324157715, 'timestamp': '2025-09-04 04:08:59.718904', 'step': 2621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1392], 'flops': 27840169073088.0}, 'timestamp': '2025-09-04 04:08:59.923521', 'step': 2621, 'epoch': 3} {'type': 'loss', 'content': 0.0053197117522358894, 'timestamp': '2025-09-04 04:08:59.963010', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:09:00.073876', 'step': 2622, 'epoch': 3} {'type': 'loss', 'content': 0.020664365962147713, 'timestamp': '2025-09-04 04:09:00.094500', 'step': 2623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 04:09:00.309867', 'step': 2623, 'epoch': 3} {'type': 'loss', 'content': 0.015484297648072243, 'timestamp': '2025-09-04 04:09:00.351553', 'step': 2624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:00.452502', 'step': 2624, 'epoch': 3} {'type': 'loss', 'content': 0.002410691697150469, 'timestamp': '2025-09-04 04:09:00.473673', 'step': 2625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:00.579481', 'step': 2625, 'epoch': 3} {'type': 'loss', 'content': 0.019456349313259125, 'timestamp': '2025-09-04 04:09:00.599522', 'step': 2626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:00.690498', 'step': 2626, 'epoch': 3} {'type': 'loss', 'content': 0.008297095075249672, 'timestamp': '2025-09-04 04:09:00.707257', 'step': 2627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:00.807017', 'step': 2627, 'epoch': 3} {'type': 'loss', 'content': 0.003302738768979907, 'timestamp': '2025-09-04 04:09:00.826356', 'step': 2628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:00.932212', 'step': 2628, 'epoch': 3} {'type': 'loss', 'content': 0.11121071875095367, 'timestamp': '2025-09-04 04:09:00.954440', 'step': 2629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:09:01.068831', 'step': 2629, 'epoch': 3} {'type': 'loss', 'content': 0.02678021229803562, 'timestamp': '2025-09-04 04:09:01.089309', 'step': 2630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:01.195192', 'step': 2630, 'epoch': 3} {'type': 'loss', 'content': 0.00880197249352932, 'timestamp': '2025-09-04 04:09:01.214228', 'step': 2631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:01.308951', 'step': 2631, 'epoch': 3} {'type': 'loss', 'content': 0.011775809340178967, 'timestamp': '2025-09-04 04:09:01.326840', 'step': 2632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:09:01.400378', 'step': 2632, 'epoch': 3} {'type': 'loss', 'content': 0.010084496811032295, 'timestamp': '2025-09-04 04:09:01.415169', 'step': 2633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:01.504764', 'step': 2633, 'epoch': 3} {'type': 'loss', 'content': 0.001346323057077825, 'timestamp': '2025-09-04 04:09:01.521617', 'step': 2634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:01.623955', 'step': 2634, 'epoch': 3} {'type': 'loss', 'content': 0.016563931480050087, 'timestamp': '2025-09-04 04:09:01.643140', 'step': 2635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:01.739409', 'step': 2635, 'epoch': 3} {'type': 'loss', 'content': 0.002511340891942382, 'timestamp': '2025-09-04 04:09:01.757733', 'step': 2636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:01.863630', 'step': 2636, 'epoch': 3} {'type': 'loss', 'content': 0.1092916801571846, 'timestamp': '2025-09-04 04:09:01.886104', 'step': 2637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:01.995722', 'step': 2637, 'epoch': 3} {'type': 'loss', 'content': 0.016762439161539078, 'timestamp': '2025-09-04 04:09:02.016030', 'step': 2638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:09:02.099780', 'step': 2638, 'epoch': 3} {'type': 'loss', 'content': 0.0016962930094450712, 'timestamp': '2025-09-04 04:09:02.115114', 'step': 2639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:09:02.191928', 'step': 2639, 'epoch': 3} {'type': 'loss', 'content': 0.01071829255670309, 'timestamp': '2025-09-04 04:09:02.206841', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:09:10.584308', 'step': 2640, 'epoch': 3} {'type': 'pplx', 'content': 312.75249298706785, 'timestamp': '2025-09-04 04:09:10.586294', 'step': 2640, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2640', 'timestamp': '2025-09-04 04:09:10.941242', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:09:11.023411', 'step': 2640, 'epoch': 3} {'type': 'loss', 'content': 0.014106813818216324, 'timestamp': '2025-09-04 04:09:11.039997', 'step': 2641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:09:11.139999', 'step': 2641, 'epoch': 3} {'type': 'loss', 'content': 0.005073550622910261, 'timestamp': '2025-09-04 04:09:11.158856', 'step': 2642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:11.265331', 'step': 2642, 'epoch': 3} {'type': 'loss', 'content': 0.013524402864277363, 'timestamp': '2025-09-04 04:09:11.285345', 'step': 2643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:11.368162', 'step': 2643, 'epoch': 3} {'type': 'loss', 'content': 0.009907763451337814, 'timestamp': '2025-09-04 04:09:11.384138', 'step': 2644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:09:11.457840', 'step': 2644, 'epoch': 3} {'type': 'loss', 'content': 0.002835202729329467, 'timestamp': '2025-09-04 04:09:11.472809', 'step': 2645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:11.581086', 'step': 2645, 'epoch': 3} {'type': 'loss', 'content': 0.0016281316056847572, 'timestamp': '2025-09-04 04:09:11.601399', 'step': 2646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:11.704336', 'step': 2646, 'epoch': 3} {'type': 'loss', 'content': 0.0024171601980924606, 'timestamp': '2025-09-04 04:09:11.723618', 'step': 2647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:11.819889', 'step': 2647, 'epoch': 3} {'type': 'loss', 'content': 0.002636190503835678, 'timestamp': '2025-09-04 04:09:11.837434', 'step': 2648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:11.934303', 'step': 2648, 'epoch': 3} {'type': 'loss', 'content': 0.00617353105917573, 'timestamp': '2025-09-04 04:09:11.954720', 'step': 2649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:12.047604', 'step': 2649, 'epoch': 3} {'type': 'loss', 'content': 0.0027180935721844435, 'timestamp': '2025-09-04 04:09:12.064695', 'step': 2650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:12.166834', 'step': 2650, 'epoch': 3} {'type': 'loss', 'content': 0.004888160619884729, 'timestamp': '2025-09-04 04:09:12.186041', 'step': 2651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:12.269214', 'step': 2651, 'epoch': 3} {'type': 'loss', 'content': 0.00197249511256814, 'timestamp': '2025-09-04 04:09:12.285048', 'step': 2652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:12.388267', 'step': 2652, 'epoch': 3} {'type': 'loss', 'content': 0.0006813482032157481, 'timestamp': '2025-09-04 04:09:12.410251', 'step': 2653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:12.505553', 'step': 2653, 'epoch': 3} {'type': 'loss', 'content': 0.0030994487460702658, 'timestamp': '2025-09-04 04:09:12.523054', 'step': 2654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:12.612906', 'step': 2654, 'epoch': 3} {'type': 'loss', 'content': 0.002902757376432419, 'timestamp': '2025-09-04 04:09:12.629822', 'step': 2655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:12.707648', 'step': 2655, 'epoch': 3} {'type': 'loss', 'content': 0.0018415531376376748, 'timestamp': '2025-09-04 04:09:12.722595', 'step': 2656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:09:12.820427', 'step': 2656, 'epoch': 3} {'type': 'loss', 'content': 0.015612171031534672, 'timestamp': '2025-09-04 04:09:12.841224', 'step': 2657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:12.941218', 'step': 2657, 'epoch': 3} {'type': 'loss', 'content': 0.042000770568847656, 'timestamp': '2025-09-04 04:09:12.959702', 'step': 2658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:13.054817', 'step': 2658, 'epoch': 3} {'type': 'loss', 'content': 0.0024185827933251858, 'timestamp': '2025-09-04 04:09:13.072416', 'step': 2659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:13.178902', 'step': 2659, 'epoch': 3} {'type': 'loss', 'content': 0.02803855948150158, 'timestamp': '2025-09-04 04:09:13.199805', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:09:21.711234', 'step': 2660, 'epoch': 3} {'type': 'pplx', 'content': 310.74890931755016, 'timestamp': '2025-09-04 04:09:21.713177', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:21.805471', 'step': 2660, 'epoch': 3} {'type': 'loss', 'content': 0.0010291390353813767, 'timestamp': '2025-09-04 04:09:21.824559', 'step': 2661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:09:21.934579', 'step': 2661, 'epoch': 3} {'type': 'loss', 'content': 0.005789092276245356, 'timestamp': '2025-09-04 04:09:21.955070', 'step': 2662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:22.046012', 'step': 2662, 'epoch': 3} {'type': 'loss', 'content': 0.015148711390793324, 'timestamp': '2025-09-04 04:09:22.062794', 'step': 2663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 04:09:22.282186', 'step': 2663, 'epoch': 3} {'type': 'loss', 'content': 0.004448336083441973, 'timestamp': '2025-09-04 04:09:22.325169', 'step': 2664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:09:22.417654', 'step': 2664, 'epoch': 3} {'type': 'loss', 'content': 0.0008569728815928102, 'timestamp': '2025-09-04 04:09:22.436738', 'step': 2665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:22.545585', 'step': 2665, 'epoch': 3} {'type': 'loss', 'content': 0.034380171447992325, 'timestamp': '2025-09-04 04:09:22.565817', 'step': 2666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 04:09:22.695282', 'step': 2666, 'epoch': 3} {'type': 'loss', 'content': 0.020487092435359955, 'timestamp': '2025-09-04 04:09:22.718863', 'step': 2667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:22.824617', 'step': 2667, 'epoch': 3} {'type': 'loss', 'content': 0.004988936707377434, 'timestamp': '2025-09-04 04:09:22.845347', 'step': 2668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:09:22.928065', 'step': 2668, 'epoch': 3} {'type': 'loss', 'content': 0.0038620613049715757, 'timestamp': '2025-09-04 04:09:22.944722', 'step': 2669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:23.048743', 'step': 2669, 'epoch': 3} {'type': 'loss', 'content': 0.018743831664323807, 'timestamp': '2025-09-04 04:09:23.068031', 'step': 2670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:23.146345', 'step': 2670, 'epoch': 3} {'type': 'loss', 'content': 0.044233791530132294, 'timestamp': '2025-09-04 04:09:23.160449', 'step': 2671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:23.268352', 'step': 2671, 'epoch': 3} {'type': 'loss', 'content': 0.004456370137631893, 'timestamp': '2025-09-04 04:09:23.289464', 'step': 2672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:23.386870', 'step': 2672, 'epoch': 3} {'type': 'loss', 'content': 0.08841562271118164, 'timestamp': '2025-09-04 04:09:23.407241', 'step': 2673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:09:23.518418', 'step': 2673, 'epoch': 3} {'type': 'loss', 'content': 0.005260918755084276, 'timestamp': '2025-09-04 04:09:23.539009', 'step': 2674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:23.645223', 'step': 2674, 'epoch': 3} {'type': 'loss', 'content': 0.009504856541752815, 'timestamp': '2025-09-04 04:09:23.665320', 'step': 2675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:09:23.749594', 'step': 2675, 'epoch': 3} {'type': 'loss', 'content': 0.0136948861181736, 'timestamp': '2025-09-04 04:09:23.765576', 'step': 2676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:23.847090', 'step': 2676, 'epoch': 3} {'type': 'loss', 'content': 0.00021050203940831125, 'timestamp': '2025-09-04 04:09:23.863709', 'step': 2677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:23.963423', 'step': 2677, 'epoch': 3} {'type': 'loss', 'content': 0.007187894079834223, 'timestamp': '2025-09-04 04:09:23.981938', 'step': 2678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:09:24.093209', 'step': 2678, 'epoch': 3} {'type': 'loss', 'content': 0.02946031652390957, 'timestamp': '2025-09-04 04:09:24.113901', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:24.220814', 'step': 2679, 'epoch': 3} {'type': 'loss', 'content': 0.004950105212628841, 'timestamp': '2025-09-04 04:09:24.241589', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:09:32.746359', 'step': 2680, 'epoch': 3} {'type': 'pplx', 'content': 310.73999203780255, 'timestamp': '2025-09-04 04:09:32.748816', 'step': 2680, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2680', 'timestamp': '2025-09-04 04:09:33.104178', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:33.210287', 'step': 2680, 'epoch': 3} {'type': 'loss', 'content': 0.02347641810774803, 'timestamp': '2025-09-04 04:09:33.232593', 'step': 2681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:33.327629', 'step': 2681, 'epoch': 3} {'type': 'loss', 'content': 0.019025633111596107, 'timestamp': '2025-09-04 04:09:33.345199', 'step': 2682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:33.447778', 'step': 2682, 'epoch': 3} {'type': 'loss', 'content': 0.0245287474244833, 'timestamp': '2025-09-04 04:09:33.467127', 'step': 2683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:33.575344', 'step': 2683, 'epoch': 3} {'type': 'loss', 'content': 0.04559353366494179, 'timestamp': '2025-09-04 04:09:33.596103', 'step': 2684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:33.689138', 'step': 2684, 'epoch': 3} {'type': 'loss', 'content': 0.08233796805143356, 'timestamp': '2025-09-04 04:09:33.708433', 'step': 2685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:09:33.795837', 'step': 2685, 'epoch': 3} {'type': 'loss', 'content': 0.0036527463234961033, 'timestamp': '2025-09-04 04:09:33.811533', 'step': 2686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:33.915424', 'step': 2686, 'epoch': 3} {'type': 'loss', 'content': 0.016142617911100388, 'timestamp': '2025-09-04 04:09:33.934707', 'step': 2687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:34.014124', 'step': 2687, 'epoch': 3} {'type': 'loss', 'content': 0.029892858117818832, 'timestamp': '2025-09-04 04:09:34.029190', 'step': 2688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:09:34.099577', 'step': 2688, 'epoch': 3} {'type': 'loss', 'content': 0.009370243176817894, 'timestamp': '2025-09-04 04:09:34.113742', 'step': 2689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:34.192195', 'step': 2689, 'epoch': 3} {'type': 'loss', 'content': 0.017486093565821648, 'timestamp': '2025-09-04 04:09:34.206344', 'step': 2690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:34.309269', 'step': 2690, 'epoch': 3} {'type': 'loss', 'content': 0.02338462322950363, 'timestamp': '2025-09-04 04:09:34.328488', 'step': 2691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:09:34.417191', 'step': 2691, 'epoch': 3} {'type': 'loss', 'content': 0.048346079885959625, 'timestamp': '2025-09-04 04:09:34.433483', 'step': 2692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:34.524648', 'step': 2692, 'epoch': 3} {'type': 'loss', 'content': 0.017204314470291138, 'timestamp': '2025-09-04 04:09:34.543518', 'step': 2693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:09:34.666403', 'step': 2693, 'epoch': 3} {'type': 'loss', 'content': 0.00045050657354295254, 'timestamp': '2025-09-04 04:09:34.689538', 'step': 2694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:34.774209', 'step': 2694, 'epoch': 3} {'type': 'loss', 'content': 0.01556316763162613, 'timestamp': '2025-09-04 04:09:34.789446', 'step': 2695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:34.888274', 'step': 2695, 'epoch': 3} {'type': 'loss', 'content': 0.04271606728434563, 'timestamp': '2025-09-04 04:09:34.907744', 'step': 2696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:09:35.006125', 'step': 2696, 'epoch': 3} {'type': 'loss', 'content': 0.008685296401381493, 'timestamp': '2025-09-04 04:09:35.026917', 'step': 2697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:09:35.133281', 'step': 2697, 'epoch': 3} {'type': 'loss', 'content': 0.0019303993321955204, 'timestamp': '2025-09-04 04:09:35.153384', 'step': 2698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:35.252519', 'step': 2698, 'epoch': 3} {'type': 'loss', 'content': 0.01800696924328804, 'timestamp': '2025-09-04 04:09:35.271186', 'step': 2699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:35.365532', 'step': 2699, 'epoch': 3} {'type': 'loss', 'content': 0.01136024296283722, 'timestamp': '2025-09-04 04:09:35.383767', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:09:43.928592', 'step': 2700, 'epoch': 3} {'type': 'pplx', 'content': 313.122110339322, 'timestamp': '2025-09-04 04:09:43.930940', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 04:09:44.064080', 'step': 2700, 'epoch': 3} {'type': 'loss', 'content': 0.011454450897872448, 'timestamp': '2025-09-04 04:09:44.092892', 'step': 2701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:44.185228', 'step': 2701, 'epoch': 3} {'type': 'loss', 'content': 0.031085196882486343, 'timestamp': '2025-09-04 04:09:44.201909', 'step': 2702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:09:44.304243', 'step': 2702, 'epoch': 3} {'type': 'loss', 'content': 0.006320777349174023, 'timestamp': '2025-09-04 04:09:44.322922', 'step': 2703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:44.427439', 'step': 2703, 'epoch': 3} {'type': 'loss', 'content': 0.012303713709115982, 'timestamp': '2025-09-04 04:09:44.447172', 'step': 2704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:09:44.531985', 'step': 2704, 'epoch': 3} {'type': 'loss', 'content': 0.004898655693978071, 'timestamp': '2025-09-04 04:09:44.548628', 'step': 2705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:09:44.662833', 'step': 2705, 'epoch': 3} {'type': 'loss', 'content': 0.01601606048643589, 'timestamp': '2025-09-04 04:09:44.681951', 'step': 2706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:09:44.767300', 'step': 2706, 'epoch': 3} {'type': 'loss', 'content': 0.011142881587147713, 'timestamp': '2025-09-04 04:09:44.782310', 'step': 2707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:09:44.901063', 'step': 2707, 'epoch': 3} {'type': 'loss', 'content': 0.01005838718265295, 'timestamp': '2025-09-04 04:09:44.923827', 'step': 2708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:45.024411', 'step': 2708, 'epoch': 3} {'type': 'loss', 'content': 0.05936342850327492, 'timestamp': '2025-09-04 04:09:45.044274', 'step': 2709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:45.156582', 'step': 2709, 'epoch': 3} {'type': 'loss', 'content': 0.008189283311367035, 'timestamp': '2025-09-04 04:09:45.173064', 'step': 2710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:09:45.285504', 'step': 2710, 'epoch': 3} {'type': 'loss', 'content': 0.10139759629964828, 'timestamp': '2025-09-04 04:09:45.305622', 'step': 2711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:45.407014', 'step': 2711, 'epoch': 3} {'type': 'loss', 'content': 0.023838041350245476, 'timestamp': '2025-09-04 04:09:45.426287', 'step': 2712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:45.519681', 'step': 2712, 'epoch': 3} {'type': 'loss', 'content': 0.02069295570254326, 'timestamp': '2025-09-04 04:09:45.538280', 'step': 2713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:09:45.616632', 'step': 2713, 'epoch': 3} {'type': 'loss', 'content': 0.07725197076797485, 'timestamp': '2025-09-04 04:09:45.630199', 'step': 2714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:45.725539', 'step': 2714, 'epoch': 3} {'type': 'loss', 'content': 0.006542861927300692, 'timestamp': '2025-09-04 04:09:45.742578', 'step': 2715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:45.847195', 'step': 2715, 'epoch': 3} {'type': 'loss', 'content': 0.03040933422744274, 'timestamp': '2025-09-04 04:09:45.867057', 'step': 2716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:09:45.961569', 'step': 2716, 'epoch': 3} {'type': 'loss', 'content': 0.0011650609085336328, 'timestamp': '2025-09-04 04:09:45.980564', 'step': 2717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:46.091204', 'step': 2717, 'epoch': 3} {'type': 'loss', 'content': 0.020016556605696678, 'timestamp': '2025-09-04 04:09:46.111056', 'step': 2718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:46.213969', 'step': 2718, 'epoch': 3} {'type': 'loss', 'content': 0.0162852443754673, 'timestamp': '2025-09-04 04:09:46.231413', 'step': 2719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:09:46.352029', 'step': 2719, 'epoch': 3} {'type': 'loss', 'content': 0.0035490740556269884, 'timestamp': '2025-09-04 04:09:46.374802', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:09:54.902522', 'step': 2720, 'epoch': 3} {'type': 'pplx', 'content': 314.5356204228338, 'timestamp': '2025-09-04 04:09:54.905530', 'step': 2720, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2720', 'timestamp': '2025-09-04 04:09:55.409233', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 04:09:55.588923', 'step': 2720, 'epoch': 3} {'type': 'loss', 'content': 0.0011699828319251537, 'timestamp': '2025-09-04 04:09:55.626999', 'step': 2721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:55.710243', 'step': 2721, 'epoch': 3} {'type': 'loss', 'content': 0.008469424210488796, 'timestamp': '2025-09-04 04:09:55.725330', 'step': 2722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:09:55.843055', 'step': 2722, 'epoch': 3} {'type': 'loss', 'content': 0.005703017581254244, 'timestamp': '2025-09-04 04:09:55.865254', 'step': 2723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:09:55.940919', 'step': 2723, 'epoch': 3} {'type': 'loss', 'content': 0.06145107373595238, 'timestamp': '2025-09-04 04:09:55.955263', 'step': 2724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:56.031260', 'step': 2724, 'epoch': 3} {'type': 'loss', 'content': 0.021033862605690956, 'timestamp': '2025-09-04 04:09:56.046734', 'step': 2725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:09:56.138352', 'step': 2725, 'epoch': 3} {'type': 'loss', 'content': 0.009126712568104267, 'timestamp': '2025-09-04 04:09:56.155147', 'step': 2726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:09:56.272076', 'step': 2726, 'epoch': 3} {'type': 'loss', 'content': 0.009277377277612686, 'timestamp': '2025-09-04 04:09:56.292646', 'step': 2727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:09:56.386013', 'step': 2727, 'epoch': 3} {'type': 'loss', 'content': 0.0015412485226988792, 'timestamp': '2025-09-04 04:09:56.404221', 'step': 2728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:09:56.499794', 'step': 2728, 'epoch': 3} {'type': 'loss', 'content': 0.02090812474489212, 'timestamp': '2025-09-04 04:09:56.520175', 'step': 2729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:09:56.605490', 'step': 2729, 'epoch': 3} {'type': 'loss', 'content': 0.010399392805993557, 'timestamp': '2025-09-04 04:09:56.620588', 'step': 2730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:09:56.730598', 'step': 2730, 'epoch': 3} {'type': 'loss', 'content': 0.0056745195761322975, 'timestamp': '2025-09-04 04:09:56.750912', 'step': 2731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:09:56.844525', 'step': 2731, 'epoch': 3} {'type': 'loss', 'content': 0.13878655433654785, 'timestamp': '2025-09-04 04:09:56.862397', 'step': 2732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:09:56.938712', 'step': 2732, 'epoch': 3} {'type': 'loss', 'content': 0.016388939693570137, 'timestamp': '2025-09-04 04:09:56.954158', 'step': 2733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:09:57.053909', 'step': 2733, 'epoch': 3} {'type': 'loss', 'content': 0.011486927047371864, 'timestamp': '2025-09-04 04:09:57.072709', 'step': 2734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:09:57.189227', 'step': 2734, 'epoch': 3} {'type': 'loss', 'content': 0.007860065437853336, 'timestamp': '2025-09-04 04:09:57.211366', 'step': 2735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:09:57.322583', 'step': 2735, 'epoch': 3} {'type': 'loss', 'content': 0.013591518625617027, 'timestamp': '2025-09-04 04:09:57.343880', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:57.445097', 'step': 2736, 'epoch': 3} {'type': 'loss', 'content': 0.002234839601442218, 'timestamp': '2025-09-04 04:09:57.466151', 'step': 2737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:57.568464', 'step': 2737, 'epoch': 3} {'type': 'loss', 'content': 0.05005452781915665, 'timestamp': '2025-09-04 04:09:57.585998', 'step': 2738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:09:57.689780', 'step': 2738, 'epoch': 3} {'type': 'loss', 'content': 0.02009434998035431, 'timestamp': '2025-09-04 04:09:57.708905', 'step': 2739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:09:57.804873', 'step': 2739, 'epoch': 3} {'type': 'loss', 'content': 0.021923096850514412, 'timestamp': '2025-09-04 04:09:57.823090', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:10:06.199426', 'step': 2740, 'epoch': 3} {'type': 'pplx', 'content': 311.12367018379337, 'timestamp': '2025-09-04 04:10:06.201852', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:06.299389', 'step': 2740, 'epoch': 3} {'type': 'loss', 'content': 0.017065318301320076, 'timestamp': '2025-09-04 04:10:06.320394', 'step': 2741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:06.429479', 'step': 2741, 'epoch': 3} {'type': 'loss', 'content': 0.02739621140062809, 'timestamp': '2025-09-04 04:10:06.449742', 'step': 2742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:06.557790', 'step': 2742, 'epoch': 3} {'type': 'loss', 'content': 0.0004024782683700323, 'timestamp': '2025-09-04 04:10:06.578034', 'step': 2743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:06.684752', 'step': 2743, 'epoch': 3} {'type': 'loss', 'content': 0.03272821381688118, 'timestamp': '2025-09-04 04:10:06.705438', 'step': 2744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 04:10:06.833286', 'step': 2744, 'epoch': 3} {'type': 'loss', 'content': 0.036441512405872345, 'timestamp': '2025-09-04 04:10:06.860279', 'step': 2745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:06.961146', 'step': 2745, 'epoch': 3} {'type': 'loss', 'content': 0.017346149310469627, 'timestamp': '2025-09-04 04:10:06.979983', 'step': 2746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:07.086318', 'step': 2746, 'epoch': 3} {'type': 'loss', 'content': 0.008722832426428795, 'timestamp': '2025-09-04 04:10:07.106265', 'step': 2747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:07.216471', 'step': 2747, 'epoch': 3} {'type': 'loss', 'content': 0.004638466984033585, 'timestamp': '2025-09-04 04:10:07.237228', 'step': 2748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:10:07.320538', 'step': 2748, 'epoch': 3} {'type': 'loss', 'content': 0.0057872445322573185, 'timestamp': '2025-09-04 04:10:07.337391', 'step': 2749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:10:07.448267', 'step': 2749, 'epoch': 3} {'type': 'loss', 'content': 0.0026939285453408957, 'timestamp': '2025-09-04 04:10:07.468552', 'step': 2750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:07.570874', 'step': 2750, 'epoch': 3} {'type': 'loss', 'content': 0.0036715560127049685, 'timestamp': '2025-09-04 04:10:07.589996', 'step': 2751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:10:07.671514', 'step': 2751, 'epoch': 3} {'type': 'loss', 'content': 0.0037178792990744114, 'timestamp': '2025-09-04 04:10:07.686499', 'step': 2752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:10:07.777656', 'step': 2752, 'epoch': 3} {'type': 'loss', 'content': 0.0027517317794263363, 'timestamp': '2025-09-04 04:10:07.796789', 'step': 2753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:07.897199', 'step': 2753, 'epoch': 3} {'type': 'loss', 'content': 0.0021229060366749763, 'timestamp': '2025-09-04 04:10:07.916168', 'step': 2754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:10:07.994227', 'step': 2754, 'epoch': 3} {'type': 'loss', 'content': 0.006144124083220959, 'timestamp': '2025-09-04 04:10:08.008433', 'step': 2755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:08.109566', 'step': 2755, 'epoch': 3} {'type': 'loss', 'content': 0.0041644214652478695, 'timestamp': '2025-09-04 04:10:08.129094', 'step': 2756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:10:08.239640', 'step': 2756, 'epoch': 3} {'type': 'loss', 'content': 0.018796509131789207, 'timestamp': '2025-09-04 04:10:08.262060', 'step': 2757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:10:08.379321', 'step': 2757, 'epoch': 3} {'type': 'loss', 'content': 0.009927745908498764, 'timestamp': '2025-09-04 04:10:08.401456', 'step': 2758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:10:08.511886', 'step': 2758, 'epoch': 3} {'type': 'loss', 'content': 0.0052697621285915375, 'timestamp': '2025-09-04 04:10:08.532136', 'step': 2759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:08.642566', 'step': 2759, 'epoch': 3} {'type': 'loss', 'content': 0.005806296598166227, 'timestamp': '2025-09-04 04:10:08.663642', 'step': 2760, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:10:17.026885', 'step': 2760, 'epoch': 3} {'type': 'pplx', 'content': 312.6903928437137, 'timestamp': '2025-09-04 04:10:17.028993', 'step': 2760, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2760', 'timestamp': '2025-09-04 04:10:17.510313', 'step': 2760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:10:17.606536', 'step': 2760, 'epoch': 3} {'type': 'loss', 'content': 0.03818826749920845, 'timestamp': '2025-09-04 04:10:17.626870', 'step': 2761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:10:17.708673', 'step': 2761, 'epoch': 3} {'type': 'loss', 'content': 0.004727974068373442, 'timestamp': '2025-09-04 04:10:17.723698', 'step': 2762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:17.824531', 'step': 2762, 'epoch': 3} {'type': 'loss', 'content': 0.001716333907097578, 'timestamp': '2025-09-04 04:10:17.843370', 'step': 2763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:10:17.929149', 'step': 2763, 'epoch': 3} {'type': 'loss', 'content': 0.01537768542766571, 'timestamp': '2025-09-04 04:10:17.945052', 'step': 2764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:18.039295', 'step': 2764, 'epoch': 3} {'type': 'loss', 'content': 0.003431823570281267, 'timestamp': '2025-09-04 04:10:18.058557', 'step': 2765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:10:18.152151', 'step': 2765, 'epoch': 3} {'type': 'loss', 'content': 0.01357530988752842, 'timestamp': '2025-09-04 04:10:18.168950', 'step': 2766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:10:18.263993', 'step': 2766, 'epoch': 3} {'type': 'loss', 'content': 0.009554415941238403, 'timestamp': '2025-09-04 04:10:18.281261', 'step': 2767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:18.386404', 'step': 2767, 'epoch': 3} {'type': 'loss', 'content': 0.04511303827166557, 'timestamp': '2025-09-04 04:10:18.406572', 'step': 2768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:18.510539', 'step': 2768, 'epoch': 3} {'type': 'loss', 'content': 0.00565410265699029, 'timestamp': '2025-09-04 04:10:18.532370', 'step': 2769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:18.627990', 'step': 2769, 'epoch': 3} {'type': 'loss', 'content': 0.01116864662617445, 'timestamp': '2025-09-04 04:10:18.645617', 'step': 2770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:10:18.717805', 'step': 2770, 'epoch': 3} {'type': 'loss', 'content': 0.024722347036004066, 'timestamp': '2025-09-04 04:10:18.730823', 'step': 2771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:18.831877', 'step': 2771, 'epoch': 3} {'type': 'loss', 'content': 0.028597375378012657, 'timestamp': '2025-09-04 04:10:18.851318', 'step': 2772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:18.929289', 'step': 2772, 'epoch': 3} {'type': 'loss', 'content': 0.0017174814129248261, 'timestamp': '2025-09-04 04:10:18.944338', 'step': 2773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:19.054046', 'step': 2773, 'epoch': 3} {'type': 'loss', 'content': 0.018303746357560158, 'timestamp': '2025-09-04 04:10:19.074366', 'step': 2774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:19.182654', 'step': 2774, 'epoch': 3} {'type': 'loss', 'content': 0.005763411987572908, 'timestamp': '2025-09-04 04:10:19.202145', 'step': 2775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:10:19.290334', 'step': 2775, 'epoch': 3} {'type': 'loss', 'content': 0.08446403592824936, 'timestamp': '2025-09-04 04:10:19.306205', 'step': 2776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:19.405065', 'step': 2776, 'epoch': 3} {'type': 'loss', 'content': 0.02213035710155964, 'timestamp': '2025-09-04 04:10:19.425185', 'step': 2777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:19.528569', 'step': 2777, 'epoch': 3} {'type': 'loss', 'content': 0.00788130797445774, 'timestamp': '2025-09-04 04:10:19.547055', 'step': 2778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:10:19.630529', 'step': 2778, 'epoch': 3} {'type': 'loss', 'content': 0.002031184732913971, 'timestamp': '2025-09-04 04:10:19.645671', 'step': 2779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:19.755409', 'step': 2779, 'epoch': 3} {'type': 'loss', 'content': 0.0032860611099749804, 'timestamp': '2025-09-04 04:10:19.776604', 'step': 2780, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:10:28.279651', 'step': 2780, 'epoch': 3} {'type': 'pplx', 'content': 315.0957572527015, 'timestamp': '2025-09-04 04:10:28.282924', 'step': 2780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:28.358024', 'step': 2780, 'epoch': 3} {'type': 'loss', 'content': 0.020035451278090477, 'timestamp': '2025-09-04 04:10:28.373018', 'step': 2781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:28.481511', 'step': 2781, 'epoch': 3} {'type': 'loss', 'content': 0.019282890483736992, 'timestamp': '2025-09-04 04:10:28.501835', 'step': 2782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:28.605645', 'step': 2782, 'epoch': 3} {'type': 'loss', 'content': 0.0053369090892374516, 'timestamp': '2025-09-04 04:10:28.624855', 'step': 2783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:28.702196', 'step': 2783, 'epoch': 3} {'type': 'loss', 'content': 0.003618256188929081, 'timestamp': '2025-09-04 04:10:28.717006', 'step': 2784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:10:28.833789', 'step': 2784, 'epoch': 3} {'type': 'loss', 'content': 0.03532740846276283, 'timestamp': '2025-09-04 04:10:28.857626', 'step': 2785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:10:28.957073', 'step': 2785, 'epoch': 3} {'type': 'loss', 'content': 0.014763697050511837, 'timestamp': '2025-09-04 04:10:28.975683', 'step': 2786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:29.082579', 'step': 2786, 'epoch': 3} {'type': 'loss', 'content': 0.05285456404089928, 'timestamp': '2025-09-04 04:10:29.102524', 'step': 2787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:29.209574', 'step': 2787, 'epoch': 3} {'type': 'loss', 'content': 0.01331684272736311, 'timestamp': '2025-09-04 04:10:29.230075', 'step': 2788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:29.306219', 'step': 2788, 'epoch': 3} {'type': 'loss', 'content': 0.03862687200307846, 'timestamp': '2025-09-04 04:10:29.321566', 'step': 2789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:10:29.405360', 'step': 2789, 'epoch': 3} {'type': 'loss', 'content': 0.0027003700379282236, 'timestamp': '2025-09-04 04:10:29.420505', 'step': 2790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:10:29.513961', 'step': 2790, 'epoch': 3} {'type': 'loss', 'content': 0.024312736466526985, 'timestamp': '2025-09-04 04:10:29.531382', 'step': 2791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:10:29.641310', 'step': 2791, 'epoch': 3} {'type': 'loss', 'content': 0.009408136829733849, 'timestamp': '2025-09-04 04:10:29.662400', 'step': 2792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:10:29.770535', 'step': 2792, 'epoch': 3} {'type': 'loss', 'content': 0.008958464488387108, 'timestamp': '2025-09-04 04:10:29.793068', 'step': 2793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:29.900709', 'step': 2793, 'epoch': 3} {'type': 'loss', 'content': 0.0011589645873755217, 'timestamp': '2025-09-04 04:10:29.920749', 'step': 2794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:30.016347', 'step': 2794, 'epoch': 3} {'type': 'loss', 'content': 0.00341598829254508, 'timestamp': '2025-09-04 04:10:30.033834', 'step': 2795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:30.111178', 'step': 2795, 'epoch': 3} {'type': 'loss', 'content': 0.012302583083510399, 'timestamp': '2025-09-04 04:10:30.125938', 'step': 2796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:30.224337', 'step': 2796, 'epoch': 3} {'type': 'loss', 'content': 0.017612578347325325, 'timestamp': '2025-09-04 04:10:30.245067', 'step': 2797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:30.339629', 'step': 2797, 'epoch': 3} {'type': 'loss', 'content': 0.007235830184072256, 'timestamp': '2025-09-04 04:10:30.357101', 'step': 2798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:10:30.450764', 'step': 2798, 'epoch': 3} {'type': 'loss', 'content': 0.0061172679997980595, 'timestamp': '2025-09-04 04:10:30.468141', 'step': 2799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:10:30.579493', 'step': 2799, 'epoch': 3} {'type': 'loss', 'content': 0.001860892865806818, 'timestamp': '2025-09-04 04:10:30.600698', 'step': 2800, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:10:39.011398', 'step': 2800, 'epoch': 3} {'type': 'pplx', 'content': 318.34111266257355, 'timestamp': '2025-09-04 04:10:39.013794', 'step': 2800, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2800', 'timestamp': '2025-09-04 04:10:39.393827', 'step': 2800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:10:39.474968', 'step': 2800, 'epoch': 3} {'type': 'loss', 'content': 0.10843028128147125, 'timestamp': '2025-09-04 04:10:39.491863', 'step': 2801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:39.595028', 'step': 2801, 'epoch': 3} {'type': 'loss', 'content': 0.002566551323980093, 'timestamp': '2025-09-04 04:10:39.614327', 'step': 2802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:10:39.708227', 'step': 2802, 'epoch': 3} {'type': 'loss', 'content': 0.010621008463203907, 'timestamp': '2025-09-04 04:10:39.725450', 'step': 2803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:10:39.803350', 'step': 2803, 'epoch': 3} {'type': 'loss', 'content': 0.015502018854022026, 'timestamp': '2025-09-04 04:10:39.818407', 'step': 2804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:39.918618', 'step': 2804, 'epoch': 3} {'type': 'loss', 'content': 0.00420707743614912, 'timestamp': '2025-09-04 04:10:39.939782', 'step': 2805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:40.032227', 'step': 2805, 'epoch': 3} {'type': 'loss', 'content': 0.02263396605849266, 'timestamp': '2025-09-04 04:10:40.046253', 'step': 2806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:10:40.124234', 'step': 2806, 'epoch': 3} {'type': 'loss', 'content': 0.0052959551103413105, 'timestamp': '2025-09-04 04:10:40.138380', 'step': 2807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:40.242077', 'step': 2807, 'epoch': 3} {'type': 'loss', 'content': 0.015647169202566147, 'timestamp': '2025-09-04 04:10:40.262075', 'step': 2808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:40.369856', 'step': 2808, 'epoch': 3} {'type': 'loss', 'content': 0.0023905981797724962, 'timestamp': '2025-09-04 04:10:40.392110', 'step': 2809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:40.500448', 'step': 2809, 'epoch': 3} {'type': 'loss', 'content': 0.002249652286991477, 'timestamp': '2025-09-04 04:10:40.520444', 'step': 2810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:40.624725', 'step': 2810, 'epoch': 3} {'type': 'loss', 'content': 0.006803617812693119, 'timestamp': '2025-09-04 04:10:40.644151', 'step': 2811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:40.748833', 'step': 2811, 'epoch': 3} {'type': 'loss', 'content': 0.0021052875090390444, 'timestamp': '2025-09-04 04:10:40.768933', 'step': 2812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:40.869181', 'step': 2812, 'epoch': 3} {'type': 'loss', 'content': 0.004011242184787989, 'timestamp': '2025-09-04 04:10:40.890221', 'step': 2813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 04:10:41.095710', 'step': 2813, 'epoch': 3} {'type': 'loss', 'content': 0.0032255747355520725, 'timestamp': '2025-09-04 04:10:41.134881', 'step': 2814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:41.212690', 'step': 2814, 'epoch': 3} {'type': 'loss', 'content': 0.0025714444927871227, 'timestamp': '2025-09-04 04:10:41.226621', 'step': 2815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:41.327667', 'step': 2815, 'epoch': 3} {'type': 'loss', 'content': 0.013107407838106155, 'timestamp': '2025-09-04 04:10:41.347348', 'step': 2816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:10:41.444093', 'step': 2816, 'epoch': 3} {'type': 'loss', 'content': 0.0025292744394391775, 'timestamp': '2025-09-04 04:10:41.464580', 'step': 2817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:41.566996', 'step': 2817, 'epoch': 3} {'type': 'loss', 'content': 0.019604351371526718, 'timestamp': '2025-09-04 04:10:41.586193', 'step': 2818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:10:41.685731', 'step': 2818, 'epoch': 3} {'type': 'loss', 'content': 0.006207880564033985, 'timestamp': '2025-09-04 04:10:41.704494', 'step': 2819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:10:41.804185', 'step': 2819, 'epoch': 3} {'type': 'loss', 'content': 0.0022927229292690754, 'timestamp': '2025-09-04 04:10:41.823554', 'step': 2820, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:10:50.340911', 'step': 2820, 'epoch': 3} {'type': 'pplx', 'content': 319.52629439814444, 'timestamp': '2025-09-04 04:10:50.346919', 'step': 2820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:50.421114', 'step': 2820, 'epoch': 3} {'type': 'loss', 'content': 0.0014415581244975328, 'timestamp': '2025-09-04 04:10:50.436450', 'step': 2821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:50.513386', 'step': 2821, 'epoch': 3} {'type': 'loss', 'content': 0.039860036224126816, 'timestamp': '2025-09-04 04:10:50.527474', 'step': 2822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:10:50.638203', 'step': 2822, 'epoch': 3} {'type': 'loss', 'content': 0.04684140905737877, 'timestamp': '2025-09-04 04:10:50.658808', 'step': 2823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:10:50.751914', 'step': 2823, 'epoch': 3} {'type': 'loss', 'content': 0.005943602416664362, 'timestamp': '2025-09-04 04:10:50.769856', 'step': 2824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:50.868293', 'step': 2824, 'epoch': 3} {'type': 'loss', 'content': 0.006049733608961105, 'timestamp': '2025-09-04 04:10:50.888798', 'step': 2825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:50.988731', 'step': 2825, 'epoch': 3} {'type': 'loss', 'content': 0.0014789201086387038, 'timestamp': '2025-09-04 04:10:51.007670', 'step': 2826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 04:10:51.233431', 'step': 2826, 'epoch': 3} {'type': 'loss', 'content': 0.029814256355166435, 'timestamp': '2025-09-04 04:10:51.275647', 'step': 2827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:10:51.380010', 'step': 2827, 'epoch': 3} {'type': 'loss', 'content': 0.004776179324835539, 'timestamp': '2025-09-04 04:10:51.400069', 'step': 2828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:10:51.476052', 'step': 2828, 'epoch': 3} {'type': 'loss', 'content': 0.025096198543906212, 'timestamp': '2025-09-04 04:10:51.491170', 'step': 2829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1200], 'flops': 24000145761984.0}, 'timestamp': '2025-09-04 04:10:51.666720', 'step': 2829, 'epoch': 3} {'type': 'loss', 'content': 0.0008539275149814785, 'timestamp': '2025-09-04 04:10:51.699819', 'step': 2830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:51.794605', 'step': 2830, 'epoch': 3} {'type': 'loss', 'content': 0.039075467735528946, 'timestamp': '2025-09-04 04:10:51.812176', 'step': 2831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:10:51.927654', 'step': 2831, 'epoch': 3} {'type': 'loss', 'content': 0.002711005974560976, 'timestamp': '2025-09-04 04:10:51.948514', 'step': 2832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:10:52.054122', 'step': 2832, 'epoch': 3} {'type': 'loss', 'content': 0.01790589839220047, 'timestamp': '2025-09-04 04:10:52.076419', 'step': 2833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:52.177333', 'step': 2833, 'epoch': 3} {'type': 'loss', 'content': 0.06275072693824768, 'timestamp': '2025-09-04 04:10:52.196207', 'step': 2834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:52.300946', 'step': 2834, 'epoch': 3} {'type': 'loss', 'content': 0.004898981656879187, 'timestamp': '2025-09-04 04:10:52.320225', 'step': 2835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:10:52.532531', 'step': 2835, 'epoch': 3} {'type': 'loss', 'content': 0.009175102226436138, 'timestamp': '2025-09-04 04:10:52.572401', 'step': 2836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:10:52.665860', 'step': 2836, 'epoch': 3} {'type': 'loss', 'content': 0.009001928381621838, 'timestamp': '2025-09-04 04:10:52.685175', 'step': 2837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:10:52.768630', 'step': 2837, 'epoch': 3} {'type': 'loss', 'content': 0.0046532354317605495, 'timestamp': '2025-09-04 04:10:52.783757', 'step': 2838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:10:52.887118', 'step': 2838, 'epoch': 3} {'type': 'loss', 'content': 0.010329126380383968, 'timestamp': '2025-09-04 04:10:52.906475', 'step': 2839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:10:53.005841', 'step': 2839, 'epoch': 3} {'type': 'loss', 'content': 0.010225264355540276, 'timestamp': '2025-09-04 04:10:53.025440', 'step': 2840, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:01.520893', 'step': 2840, 'epoch': 3} {'type': 'pplx', 'content': 325.51343438767867, 'timestamp': '2025-09-04 04:11:01.522859', 'step': 2840, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2840', 'timestamp': '2025-09-04 04:11:01.898939', 'step': 2840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:02.001221', 'step': 2840, 'epoch': 3} {'type': 'loss', 'content': 0.01796550862491131, 'timestamp': '2025-09-04 04:11:02.022491', 'step': 2841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:11:02.125380', 'step': 2841, 'epoch': 3} {'type': 'loss', 'content': 0.005674062762409449, 'timestamp': '2025-09-04 04:11:02.144648', 'step': 2842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:11:02.266925', 'step': 2842, 'epoch': 3} {'type': 'loss', 'content': 0.01859263703227043, 'timestamp': '2025-09-04 04:11:02.290243', 'step': 2843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:02.397530', 'step': 2843, 'epoch': 3} {'type': 'loss', 'content': 0.0015665870159864426, 'timestamp': '2025-09-04 04:11:02.418251', 'step': 2844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:11:02.527583', 'step': 2844, 'epoch': 3} {'type': 'loss', 'content': 0.012087870389223099, 'timestamp': '2025-09-04 04:11:02.550254', 'step': 2845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:02.659541', 'step': 2845, 'epoch': 3} {'type': 'loss', 'content': 0.020001424476504326, 'timestamp': '2025-09-04 04:11:02.679920', 'step': 2846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:11:02.756649', 'step': 2846, 'epoch': 3} {'type': 'loss', 'content': 0.028880124911665916, 'timestamp': '2025-09-04 04:11:02.770460', 'step': 2847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 04:11:02.907840', 'step': 2847, 'epoch': 3} {'type': 'loss', 'content': 0.05806123465299606, 'timestamp': '2025-09-04 04:11:02.934843', 'step': 2848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:11:03.012846', 'step': 2848, 'epoch': 3} {'type': 'loss', 'content': 0.024739542976021767, 'timestamp': '2025-09-04 04:11:03.028248', 'step': 2849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:03.132752', 'step': 2849, 'epoch': 3} {'type': 'loss', 'content': 0.01195996068418026, 'timestamp': '2025-09-04 04:11:03.151984', 'step': 2850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:03.252141', 'step': 2850, 'epoch': 3} {'type': 'loss', 'content': 0.002954959636554122, 'timestamp': '2025-09-04 04:11:03.271134', 'step': 2851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:03.381295', 'step': 2851, 'epoch': 3} {'type': 'loss', 'content': 0.03290526196360588, 'timestamp': '2025-09-04 04:11:03.402381', 'step': 2852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:03.503914', 'step': 2852, 'epoch': 3} {'type': 'loss', 'content': 0.0018533534603193402, 'timestamp': '2025-09-04 04:11:03.525056', 'step': 2853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:03.629515', 'step': 2853, 'epoch': 3} {'type': 'loss', 'content': 0.006066088564693928, 'timestamp': '2025-09-04 04:11:03.648528', 'step': 2854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 04:11:03.822330', 'step': 2854, 'epoch': 3} {'type': 'loss', 'content': 0.0006023372989147902, 'timestamp': '2025-09-04 04:11:03.855018', 'step': 2855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:11:03.941618', 'step': 2855, 'epoch': 3} {'type': 'loss', 'content': 0.012206432409584522, 'timestamp': '2025-09-04 04:11:03.957991', 'step': 2856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:04.049387', 'step': 2856, 'epoch': 3} {'type': 'loss', 'content': 0.011448818258941174, 'timestamp': '2025-09-04 04:11:04.068480', 'step': 2857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:04.161673', 'step': 2857, 'epoch': 3} {'type': 'loss', 'content': 0.008118141442537308, 'timestamp': '2025-09-04 04:11:04.178779', 'step': 2858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:04.271839', 'step': 2858, 'epoch': 3} {'type': 'loss', 'content': 0.008240415714681149, 'timestamp': '2025-09-04 04:11:04.288945', 'step': 2859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:04.389798', 'step': 2859, 'epoch': 3} {'type': 'loss', 'content': 0.0023680399172008038, 'timestamp': '2025-09-04 04:11:04.409541', 'step': 2860, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:12.908013', 'step': 2860, 'epoch': 3} {'type': 'pplx', 'content': 331.32267155954526, 'timestamp': '2025-09-04 04:11:12.910564', 'step': 2860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:13.014712', 'step': 2860, 'epoch': 3} {'type': 'loss', 'content': 0.0046889204531908035, 'timestamp': '2025-09-04 04:11:13.037029', 'step': 2861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:13.136629', 'step': 2861, 'epoch': 3} {'type': 'loss', 'content': 0.016678936779499054, 'timestamp': '2025-09-04 04:11:13.155187', 'step': 2862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:13.255837', 'step': 2862, 'epoch': 3} {'type': 'loss', 'content': 0.0009306335123255849, 'timestamp': '2025-09-04 04:11:13.274539', 'step': 2863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:11:13.384828', 'step': 2863, 'epoch': 3} {'type': 'loss', 'content': 0.003980133216828108, 'timestamp': '2025-09-04 04:11:13.406117', 'step': 2864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:13.501527', 'step': 2864, 'epoch': 3} {'type': 'loss', 'content': 0.0005760484491474926, 'timestamp': '2025-09-04 04:11:13.520434', 'step': 2865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:11:13.631198', 'step': 2865, 'epoch': 3} {'type': 'loss', 'content': 0.0011027660220861435, 'timestamp': '2025-09-04 04:11:13.651783', 'step': 2866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:11:13.735521', 'step': 2866, 'epoch': 3} {'type': 'loss', 'content': 0.009353280998766422, 'timestamp': '2025-09-04 04:11:13.750541', 'step': 2867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:11:13.852996', 'step': 2867, 'epoch': 3} {'type': 'loss', 'content': 0.00644304882735014, 'timestamp': '2025-09-04 04:11:13.873074', 'step': 2868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:11:14.072568', 'step': 2868, 'epoch': 3} {'type': 'loss', 'content': 0.026496384292840958, 'timestamp': '2025-09-04 04:11:14.115279', 'step': 2869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:11:14.233382', 'step': 2869, 'epoch': 3} {'type': 'loss', 'content': 0.003224861342459917, 'timestamp': '2025-09-04 04:11:14.255497', 'step': 2870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:14.351529', 'step': 2870, 'epoch': 3} {'type': 'loss', 'content': 0.011617367155849934, 'timestamp': '2025-09-04 04:11:14.368918', 'step': 2871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:14.476955', 'step': 2871, 'epoch': 3} {'type': 'loss', 'content': 0.018545642495155334, 'timestamp': '2025-09-04 04:11:14.498148', 'step': 2872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:14.601865', 'step': 2872, 'epoch': 3} {'type': 'loss', 'content': 0.0030028163455426693, 'timestamp': '2025-09-04 04:11:14.623707', 'step': 2873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:11:14.759811', 'step': 2873, 'epoch': 3} {'type': 'loss', 'content': 0.0005331755965016782, 'timestamp': '2025-09-04 04:11:14.785767', 'step': 2874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:11:14.871281', 'step': 2874, 'epoch': 3} {'type': 'loss', 'content': 0.021123895421624184, 'timestamp': '2025-09-04 04:11:14.886891', 'step': 2875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:11:14.977528', 'step': 2875, 'epoch': 3} {'type': 'loss', 'content': 0.001793418894521892, 'timestamp': '2025-09-04 04:11:14.995043', 'step': 2876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:15.091817', 'step': 2876, 'epoch': 3} {'type': 'loss', 'content': 0.005147572606801987, 'timestamp': '2025-09-04 04:11:15.112275', 'step': 2877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:11:15.195551', 'step': 2877, 'epoch': 3} {'type': 'loss', 'content': 0.042930182069540024, 'timestamp': '2025-09-04 04:11:15.210752', 'step': 2878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:15.303265', 'step': 2878, 'epoch': 3} {'type': 'loss', 'content': 0.008315377868711948, 'timestamp': '2025-09-04 04:11:15.320401', 'step': 2879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:11:15.391797', 'step': 2879, 'epoch': 3} {'type': 'loss', 'content': 0.0415649451315403, 'timestamp': '2025-09-04 04:11:15.405539', 'step': 2880, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:23.878254', 'step': 2880, 'epoch': 3} {'type': 'pplx', 'content': 334.28353166069746, 'timestamp': '2025-09-04 04:11:23.880247', 'step': 2880, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2880', 'timestamp': '2025-09-04 04:11:24.404237', 'step': 2880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:11:24.478633', 'step': 2880, 'epoch': 3} {'type': 'loss', 'content': 0.015389709733426571, 'timestamp': '2025-09-04 04:11:24.493617', 'step': 2881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:11:24.570800', 'step': 2881, 'epoch': 3} {'type': 'loss', 'content': 0.012384703382849693, 'timestamp': '2025-09-04 04:11:24.584952', 'step': 2882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:24.680140', 'step': 2882, 'epoch': 3} {'type': 'loss', 'content': 0.0006399277481250465, 'timestamp': '2025-09-04 04:11:24.697690', 'step': 2883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:24.791237', 'step': 2883, 'epoch': 3} {'type': 'loss', 'content': 0.020068077370524406, 'timestamp': '2025-09-04 04:11:24.809296', 'step': 2884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:24.906372', 'step': 2884, 'epoch': 3} {'type': 'loss', 'content': 0.006495574954897165, 'timestamp': '2025-09-04 04:11:24.926876', 'step': 2885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:11:25.043941', 'step': 2885, 'epoch': 3} {'type': 'loss', 'content': 0.03137728571891785, 'timestamp': '2025-09-04 04:11:25.066254', 'step': 2886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:25.159725', 'step': 2886, 'epoch': 3} {'type': 'loss', 'content': 0.01227316539734602, 'timestamp': '2025-09-04 04:11:25.177117', 'step': 2887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 04:11:25.315117', 'step': 2887, 'epoch': 3} {'type': 'loss', 'content': 0.0028199530206620693, 'timestamp': '2025-09-04 04:11:25.342043', 'step': 2888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:25.432237', 'step': 2888, 'epoch': 3} {'type': 'loss', 'content': 0.009654730558395386, 'timestamp': '2025-09-04 04:11:25.451127', 'step': 2889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:25.547980', 'step': 2889, 'epoch': 3} {'type': 'loss', 'content': 0.02699156291782856, 'timestamp': '2025-09-04 04:11:25.565553', 'step': 2890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:25.666229', 'step': 2890, 'epoch': 3} {'type': 'loss', 'content': 0.014877088367938995, 'timestamp': '2025-09-04 04:11:25.684974', 'step': 2891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:11:25.764195', 'step': 2891, 'epoch': 3} {'type': 'loss', 'content': 0.041199635714292526, 'timestamp': '2025-09-04 04:11:25.779178', 'step': 2892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:25.877779', 'step': 2892, 'epoch': 3} {'type': 'loss', 'content': 0.04918312653899193, 'timestamp': '2025-09-04 04:11:25.898518', 'step': 2893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:11:25.969548', 'step': 2893, 'epoch': 3} {'type': 'loss', 'content': 0.04640977457165718, 'timestamp': '2025-09-04 04:11:25.982360', 'step': 2894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:26.091334', 'step': 2894, 'epoch': 3} {'type': 'loss', 'content': 0.011765974573791027, 'timestamp': '2025-09-04 04:11:26.111741', 'step': 2895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:26.211303', 'step': 2895, 'epoch': 3} {'type': 'loss', 'content': 0.004320243373513222, 'timestamp': '2025-09-04 04:11:26.230665', 'step': 2896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:11:26.313778', 'step': 2896, 'epoch': 3} {'type': 'loss', 'content': 0.003690729383379221, 'timestamp': '2025-09-04 04:11:26.330762', 'step': 2897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:26.427045', 'step': 2897, 'epoch': 3} {'type': 'loss', 'content': 0.005680757109075785, 'timestamp': '2025-09-04 04:11:26.444663', 'step': 2898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:26.545200', 'step': 2898, 'epoch': 3} {'type': 'loss', 'content': 0.0033540872391313314, 'timestamp': '2025-09-04 04:11:26.564186', 'step': 2899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:11:26.679234', 'step': 2899, 'epoch': 3} {'type': 'loss', 'content': 0.008547638542950153, 'timestamp': '2025-09-04 04:11:26.700432', 'step': 2900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:35.169803', 'step': 2900, 'epoch': 3} {'type': 'pplx', 'content': 331.12824914190674, 'timestamp': '2025-09-04 04:11:35.172146', 'step': 2900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:35.267283', 'step': 2900, 'epoch': 3} {'type': 'loss', 'content': 0.0010944758541882038, 'timestamp': '2025-09-04 04:11:35.287673', 'step': 2901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:11:35.363842', 'step': 2901, 'epoch': 3} {'type': 'loss', 'content': 0.016356654465198517, 'timestamp': '2025-09-04 04:11:35.377690', 'step': 2902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:35.471661', 'step': 2902, 'epoch': 3} {'type': 'loss', 'content': 0.0011579522397369146, 'timestamp': '2025-09-04 04:11:35.489033', 'step': 2903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:35.600157', 'step': 2903, 'epoch': 3} {'type': 'loss', 'content': 0.0319959782063961, 'timestamp': '2025-09-04 04:11:35.620991', 'step': 2904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:35.721187', 'step': 2904, 'epoch': 3} {'type': 'loss', 'content': 0.000950302230194211, 'timestamp': '2025-09-04 04:11:35.742430', 'step': 2905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:35.848848', 'step': 2905, 'epoch': 3} {'type': 'loss', 'content': 0.001381176058202982, 'timestamp': '2025-09-04 04:11:35.868867', 'step': 2906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:11:35.958637', 'step': 2906, 'epoch': 3} {'type': 'loss', 'content': 0.0023815217427909374, 'timestamp': '2025-09-04 04:11:35.975448', 'step': 2907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:36.069679', 'step': 2907, 'epoch': 3} {'type': 'loss', 'content': 0.01757667027413845, 'timestamp': '2025-09-04 04:11:36.087672', 'step': 2908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:36.188807', 'step': 2908, 'epoch': 3} {'type': 'loss', 'content': 0.0278801117092371, 'timestamp': '2025-09-04 04:11:36.210007', 'step': 2909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:11:36.285728', 'step': 2909, 'epoch': 3} {'type': 'loss', 'content': 0.01353493519127369, 'timestamp': '2025-09-04 04:11:36.299222', 'step': 2910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:11:36.401642', 'step': 2910, 'epoch': 3} {'type': 'loss', 'content': 0.04371942579746246, 'timestamp': '2025-09-04 04:11:36.420836', 'step': 2911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:36.521093', 'step': 2911, 'epoch': 3} {'type': 'loss', 'content': 0.0061318958178162575, 'timestamp': '2025-09-04 04:11:36.540465', 'step': 2912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:11:36.624627', 'step': 2912, 'epoch': 3} {'type': 'loss', 'content': 0.057585082948207855, 'timestamp': '2025-09-04 04:11:36.641737', 'step': 2913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:36.745445', 'step': 2913, 'epoch': 3} {'type': 'loss', 'content': 0.0021131334360688925, 'timestamp': '2025-09-04 04:11:36.763965', 'step': 2914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:11:36.848566', 'step': 2914, 'epoch': 3} {'type': 'loss', 'content': 0.01308556366711855, 'timestamp': '2025-09-04 04:11:36.863954', 'step': 2915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:11:36.987369', 'step': 2915, 'epoch': 3} {'type': 'loss', 'content': 0.0034837471321225166, 'timestamp': '2025-09-04 04:11:37.011142', 'step': 2916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:37.105198', 'step': 2916, 'epoch': 3} {'type': 'loss', 'content': 0.0005953749641776085, 'timestamp': '2025-09-04 04:11:37.124172', 'step': 2917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:37.229787', 'step': 2917, 'epoch': 3} {'type': 'loss', 'content': 0.005551936570554972, 'timestamp': '2025-09-04 04:11:37.249736', 'step': 2918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:37.356024', 'step': 2918, 'epoch': 3} {'type': 'loss', 'content': 0.008271034806966782, 'timestamp': '2025-09-04 04:11:37.376070', 'step': 2919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:11:37.510820', 'step': 2919, 'epoch': 3} {'type': 'loss', 'content': 0.01085783913731575, 'timestamp': '2025-09-04 04:11:37.537414', 'step': 2920, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:46.026493', 'step': 2920, 'epoch': 3} {'type': 'pplx', 'content': 321.86002245296936, 'timestamp': '2025-09-04 04:11:46.028695', 'step': 2920, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2920', 'timestamp': '2025-09-04 04:11:46.382176', 'step': 2920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:11:46.500520', 'step': 2920, 'epoch': 3} {'type': 'loss', 'content': 0.0059812976978719234, 'timestamp': '2025-09-04 04:11:46.525777', 'step': 2921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:46.631419', 'step': 2921, 'epoch': 3} {'type': 'loss', 'content': 0.0009687381098046899, 'timestamp': '2025-09-04 04:11:46.648895', 'step': 2922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:11:46.784826', 'step': 2922, 'epoch': 3} {'type': 'loss', 'content': 0.005854323972016573, 'timestamp': '2025-09-04 04:11:46.810927', 'step': 2923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:46.910290', 'step': 2923, 'epoch': 3} {'type': 'loss', 'content': 0.023817606270313263, 'timestamp': '2025-09-04 04:11:46.929762', 'step': 2924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:11:47.036718', 'step': 2924, 'epoch': 3} {'type': 'loss', 'content': 0.006292128004133701, 'timestamp': '2025-09-04 04:11:47.059290', 'step': 2925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:11:47.161720', 'step': 2925, 'epoch': 3} {'type': 'loss', 'content': 0.001953437924385071, 'timestamp': '2025-09-04 04:11:47.181032', 'step': 2926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:11:47.257446', 'step': 2926, 'epoch': 3} {'type': 'loss', 'content': 0.00025311694480478764, 'timestamp': '2025-09-04 04:11:47.271151', 'step': 2927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:47.364263', 'step': 2927, 'epoch': 3} {'type': 'loss', 'content': 0.01116481889039278, 'timestamp': '2025-09-04 04:11:47.382181', 'step': 2928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:11:47.474507', 'step': 2928, 'epoch': 3} {'type': 'loss', 'content': 0.016344843432307243, 'timestamp': '2025-09-04 04:11:47.493520', 'step': 2929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:11:47.587785', 'step': 2929, 'epoch': 3} {'type': 'loss', 'content': 0.0564613938331604, 'timestamp': '2025-09-04 04:11:47.605044', 'step': 2930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:47.707478', 'step': 2930, 'epoch': 3} {'type': 'loss', 'content': 0.006602128501981497, 'timestamp': '2025-09-04 04:11:47.726448', 'step': 2931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:11:47.804329', 'step': 2931, 'epoch': 3} {'type': 'loss', 'content': 0.00032434234162792563, 'timestamp': '2025-09-04 04:11:47.819267', 'step': 2932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:11:47.911829', 'step': 2932, 'epoch': 3} {'type': 'loss', 'content': 0.011728801764547825, 'timestamp': '2025-09-04 04:11:47.931177', 'step': 2933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:11:48.008311', 'step': 2933, 'epoch': 3} {'type': 'loss', 'content': 0.03661501035094261, 'timestamp': '2025-09-04 04:11:48.022294', 'step': 2934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:48.126565', 'step': 2934, 'epoch': 3} {'type': 'loss', 'content': 0.005506326910108328, 'timestamp': '2025-09-04 04:11:48.145965', 'step': 2935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:11:48.248985', 'step': 2935, 'epoch': 3} {'type': 'loss', 'content': 0.007380081806331873, 'timestamp': '2025-09-04 04:11:48.268936', 'step': 2936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:11:48.356618', 'step': 2936, 'epoch': 3} {'type': 'loss', 'content': 0.023367907851934433, 'timestamp': '2025-09-04 04:11:48.375066', 'step': 2937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:11:48.484782', 'step': 2937, 'epoch': 3} {'type': 'loss', 'content': 0.041043538600206375, 'timestamp': '2025-09-04 04:11:48.505538', 'step': 2938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:48.607041', 'step': 2938, 'epoch': 3} {'type': 'loss', 'content': 0.014860640279948711, 'timestamp': '2025-09-04 04:11:48.625778', 'step': 2939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:11:48.712747', 'step': 2939, 'epoch': 3} {'type': 'loss', 'content': 0.05614135414361954, 'timestamp': '2025-09-04 04:11:48.729282', 'step': 2940, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:11:57.219177', 'step': 2940, 'epoch': 3} {'type': 'pplx', 'content': 315.5265659498303, 'timestamp': '2025-09-04 04:11:57.221252', 'step': 2940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:11:57.296365', 'step': 2940, 'epoch': 3} {'type': 'loss', 'content': 0.004779836628586054, 'timestamp': '2025-09-04 04:11:57.311775', 'step': 2941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:11:57.386172', 'step': 2941, 'epoch': 3} {'type': 'loss', 'content': 0.024101873859763145, 'timestamp': '2025-09-04 04:11:57.399796', 'step': 2942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:57.506945', 'step': 2942, 'epoch': 3} {'type': 'loss', 'content': 0.011341488920152187, 'timestamp': '2025-09-04 04:11:57.527061', 'step': 2943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:11:57.637526', 'step': 2943, 'epoch': 3} {'type': 'loss', 'content': 0.022627878934144974, 'timestamp': '2025-09-04 04:11:57.658821', 'step': 2944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:57.756279', 'step': 2944, 'epoch': 3} {'type': 'loss', 'content': 0.01583726704120636, 'timestamp': '2025-09-04 04:11:57.777101', 'step': 2945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:11:57.912462', 'step': 2945, 'epoch': 3} {'type': 'loss', 'content': 0.011284503154456615, 'timestamp': '2025-09-04 04:11:57.938532', 'step': 2946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:58.041921', 'step': 2946, 'epoch': 3} {'type': 'loss', 'content': 0.01882072165608406, 'timestamp': '2025-09-04 04:11:58.061301', 'step': 2947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:11:58.145850', 'step': 2947, 'epoch': 3} {'type': 'loss', 'content': 0.01440991461277008, 'timestamp': '2025-09-04 04:11:58.159707', 'step': 2948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:11:58.256671', 'step': 2948, 'epoch': 3} {'type': 'loss', 'content': 0.026601549237966537, 'timestamp': '2025-09-04 04:11:58.277139', 'step': 2949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:11:58.380822', 'step': 2949, 'epoch': 3} {'type': 'loss', 'content': 0.03433069586753845, 'timestamp': '2025-09-04 04:11:58.400182', 'step': 2950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:11:58.476174', 'step': 2950, 'epoch': 3} {'type': 'loss', 'content': 0.0034103342331945896, 'timestamp': '2025-09-04 04:11:58.489905', 'step': 2951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:11:58.572803', 'step': 2951, 'epoch': 3} {'type': 'loss', 'content': 0.005983210634440184, 'timestamp': '2025-09-04 04:11:58.588627', 'step': 2952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:58.690559', 'step': 2952, 'epoch': 3} {'type': 'loss', 'content': 0.016223527491092682, 'timestamp': '2025-09-04 04:11:58.711211', 'step': 2953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:11:58.798428', 'step': 2953, 'epoch': 3} {'type': 'loss', 'content': 0.005183096043765545, 'timestamp': '2025-09-04 04:11:58.814156', 'step': 2954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:11:58.900792', 'step': 2954, 'epoch': 3} {'type': 'loss', 'content': 0.041471634060144424, 'timestamp': '2025-09-04 04:11:58.916498', 'step': 2955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:11:59.026318', 'step': 2955, 'epoch': 3} {'type': 'loss', 'content': 0.0007432362181134522, 'timestamp': '2025-09-04 04:11:59.046046', 'step': 2956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:11:59.165833', 'step': 2956, 'epoch': 3} {'type': 'loss', 'content': 0.010432606562972069, 'timestamp': '2025-09-04 04:11:59.191418', 'step': 2957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:11:59.270172', 'step': 2957, 'epoch': 3} {'type': 'loss', 'content': 0.015087028034031391, 'timestamp': '2025-09-04 04:11:59.284449', 'step': 2958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:11:59.392745', 'step': 2958, 'epoch': 3} {'type': 'loss', 'content': 0.0005746278329752386, 'timestamp': '2025-09-04 04:11:59.412683', 'step': 2959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 04:11:59.546824', 'step': 2959, 'epoch': 3} {'type': 'loss', 'content': 0.005405406001955271, 'timestamp': '2025-09-04 04:11:59.571566', 'step': 2960, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:12:08.056684', 'step': 2960, 'epoch': 3} {'type': 'pplx', 'content': 312.525467653901, 'timestamp': '2025-09-04 04:12:08.058678', 'step': 2960, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2960', 'timestamp': '2025-09-04 04:12:08.564968', 'step': 2960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:12:08.639868', 'step': 2960, 'epoch': 3} {'type': 'loss', 'content': 0.011480286717414856, 'timestamp': '2025-09-04 04:12:08.654576', 'step': 2961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:12:08.756540', 'step': 2961, 'epoch': 3} {'type': 'loss', 'content': 0.01124604418873787, 'timestamp': '2025-09-04 04:12:08.775864', 'step': 2962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:08.876374', 'step': 2962, 'epoch': 3} {'type': 'loss', 'content': 0.005138123407959938, 'timestamp': '2025-09-04 04:12:08.895235', 'step': 2963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:12:09.005049', 'step': 2963, 'epoch': 3} {'type': 'loss', 'content': 0.007659485097974539, 'timestamp': '2025-09-04 04:12:09.026215', 'step': 2964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:12:09.134391', 'step': 2964, 'epoch': 3} {'type': 'loss', 'content': 0.02365208975970745, 'timestamp': '2025-09-04 04:12:09.157039', 'step': 2965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:09.250194', 'step': 2965, 'epoch': 3} {'type': 'loss', 'content': 0.0039781988598406315, 'timestamp': '2025-09-04 04:12:09.267441', 'step': 2966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:12:09.379255', 'step': 2966, 'epoch': 3} {'type': 'loss', 'content': 0.0009690428851172328, 'timestamp': '2025-09-04 04:12:09.399917', 'step': 2967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:09.495805', 'step': 2967, 'epoch': 3} {'type': 'loss', 'content': 0.01487466599792242, 'timestamp': '2025-09-04 04:12:09.514085', 'step': 2968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:12:09.614684', 'step': 2968, 'epoch': 3} {'type': 'loss', 'content': 0.08440519124269485, 'timestamp': '2025-09-04 04:12:09.635750', 'step': 2969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:09.734830', 'step': 2969, 'epoch': 3} {'type': 'loss', 'content': 0.020220091566443443, 'timestamp': '2025-09-04 04:12:09.753528', 'step': 2970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:09.852354', 'step': 2970, 'epoch': 3} {'type': 'loss', 'content': 0.010999851860105991, 'timestamp': '2025-09-04 04:12:09.871044', 'step': 2971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:12:09.956995', 'step': 2971, 'epoch': 3} {'type': 'loss', 'content': 0.022553278133273125, 'timestamp': '2025-09-04 04:12:09.973369', 'step': 2972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:12:10.044334', 'step': 2972, 'epoch': 3} {'type': 'loss', 'content': 0.0029222038574516773, 'timestamp': '2025-09-04 04:12:10.058568', 'step': 2973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:10.162220', 'step': 2973, 'epoch': 3} {'type': 'loss', 'content': 0.015705617144703865, 'timestamp': '2025-09-04 04:12:10.181499', 'step': 2974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 04:12:10.308836', 'step': 2974, 'epoch': 3} {'type': 'loss', 'content': 0.012157633900642395, 'timestamp': '2025-09-04 04:12:10.333367', 'step': 2975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1040], 'flops': 20800126336064.0}, 'timestamp': '2025-09-04 04:12:10.486121', 'step': 2975, 'epoch': 3} {'type': 'loss', 'content': 0.016581635922193527, 'timestamp': '2025-09-04 04:12:10.516149', 'step': 2976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:12:10.617030', 'step': 2976, 'epoch': 3} {'type': 'loss', 'content': 0.012151544913649559, 'timestamp': '2025-09-04 04:12:10.638199', 'step': 2977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:12:10.717488', 'step': 2977, 'epoch': 3} {'type': 'loss', 'content': 0.00917992927134037, 'timestamp': '2025-09-04 04:12:10.731631', 'step': 2978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:10.836079', 'step': 2978, 'epoch': 3} {'type': 'loss', 'content': 0.002120188670232892, 'timestamp': '2025-09-04 04:12:10.855226', 'step': 2979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:10.948080', 'step': 2979, 'epoch': 3} {'type': 'loss', 'content': 0.0006955720018595457, 'timestamp': '2025-09-04 04:12:10.965588', 'step': 2980, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:12:19.458344', 'step': 2980, 'epoch': 3} {'type': 'pplx', 'content': 315.1214843451217, 'timestamp': '2025-09-04 04:12:19.460879', 'step': 2980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:19.548069', 'step': 2980, 'epoch': 3} {'type': 'loss', 'content': 0.003038703231140971, 'timestamp': '2025-09-04 04:12:19.566567', 'step': 2981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:19.678102', 'step': 2981, 'epoch': 3} {'type': 'loss', 'content': 0.0017276513390243053, 'timestamp': '2025-09-04 04:12:19.698535', 'step': 2982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:19.803796', 'step': 2982, 'epoch': 3} {'type': 'loss', 'content': 0.0019167335703969002, 'timestamp': '2025-09-04 04:12:19.823105', 'step': 2983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:19.913285', 'step': 2983, 'epoch': 3} {'type': 'loss', 'content': 0.004277768079191446, 'timestamp': '2025-09-04 04:12:19.930819', 'step': 2984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:20.021819', 'step': 2984, 'epoch': 3} {'type': 'loss', 'content': 0.011646011844277382, 'timestamp': '2025-09-04 04:12:20.040607', 'step': 2985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:12:20.146162', 'step': 2985, 'epoch': 3} {'type': 'loss', 'content': 0.0010585073614493012, 'timestamp': '2025-09-04 04:12:20.166184', 'step': 2986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:20.259593', 'step': 2986, 'epoch': 3} {'type': 'loss', 'content': 0.024997970089316368, 'timestamp': '2025-09-04 04:12:20.276766', 'step': 2987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:12:20.354448', 'step': 2987, 'epoch': 3} {'type': 'loss', 'content': 0.011168187484145164, 'timestamp': '2025-09-04 04:12:20.369389', 'step': 2988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:20.470109', 'step': 2988, 'epoch': 3} {'type': 'loss', 'content': 0.004837450571358204, 'timestamp': '2025-09-04 04:12:20.491265', 'step': 2989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:12:20.568690', 'step': 2989, 'epoch': 3} {'type': 'loss', 'content': 0.02616807632148266, 'timestamp': '2025-09-04 04:12:20.582758', 'step': 2990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:12:20.660612', 'step': 2990, 'epoch': 3} {'type': 'loss', 'content': 0.006141870282590389, 'timestamp': '2025-09-04 04:12:20.674607', 'step': 2991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:12:20.760305', 'step': 2991, 'epoch': 3} {'type': 'loss', 'content': 0.006691917777061462, 'timestamp': '2025-09-04 04:12:20.776741', 'step': 2992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:20.873019', 'step': 2992, 'epoch': 3} {'type': 'loss', 'content': 0.005890341941267252, 'timestamp': '2025-09-04 04:12:20.893402', 'step': 2993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:20.986981', 'step': 2993, 'epoch': 3} {'type': 'loss', 'content': 0.0004606809816323221, 'timestamp': '2025-09-04 04:12:21.004539', 'step': 2994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:21.107947', 'step': 2994, 'epoch': 3} {'type': 'loss', 'content': 0.004719878546893597, 'timestamp': '2025-09-04 04:12:21.127136', 'step': 2995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:12:21.237919', 'step': 2995, 'epoch': 3} {'type': 'loss', 'content': 0.0019325355533510447, 'timestamp': '2025-09-04 04:12:21.259351', 'step': 2996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:21.355991', 'step': 2996, 'epoch': 3} {'type': 'loss', 'content': 0.0065997205674648285, 'timestamp': '2025-09-04 04:12:21.376493', 'step': 2997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:21.466283', 'step': 2997, 'epoch': 3} {'type': 'loss', 'content': 0.02511041797697544, 'timestamp': '2025-09-04 04:12:21.483091', 'step': 2998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:21.585402', 'step': 2998, 'epoch': 3} {'type': 'loss', 'content': 0.0015789009630680084, 'timestamp': '2025-09-04 04:12:21.604698', 'step': 2999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:21.703899', 'step': 2999, 'epoch': 3} {'type': 'loss', 'content': 0.010090545751154423, 'timestamp': '2025-09-04 04:12:21.723385', 'step': 3000, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:12:30.115857', 'step': 3000, 'epoch': 3} {'type': 'pplx', 'content': 318.48539959590164, 'timestamp': '2025-09-04 04:12:30.117830', 'step': 3000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3000', 'timestamp': '2025-09-04 04:12:30.470352', 'step': 3000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:12:30.569127', 'step': 3000, 'epoch': 3} {'type': 'loss', 'content': 0.0071251485496759415, 'timestamp': '2025-09-04 04:12:30.589877', 'step': 3001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:30.682094', 'step': 3001, 'epoch': 3} {'type': 'loss', 'content': 0.00022971679572947323, 'timestamp': '2025-09-04 04:12:30.699269', 'step': 3002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:12:30.784778', 'step': 3002, 'epoch': 3} {'type': 'loss', 'content': 0.013213549740612507, 'timestamp': '2025-09-04 04:12:30.800298', 'step': 3003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:12:30.872344', 'step': 3003, 'epoch': 3} {'type': 'loss', 'content': 0.002448985120281577, 'timestamp': '2025-09-04 04:12:30.886091', 'step': 3004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:30.976093', 'step': 3004, 'epoch': 3} {'type': 'loss', 'content': 0.00946701131761074, 'timestamp': '2025-09-04 04:12:30.994794', 'step': 3005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:12:31.079497', 'step': 3005, 'epoch': 3} {'type': 'loss', 'content': 0.027095887809991837, 'timestamp': '2025-09-04 04:12:31.095018', 'step': 3006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:31.188994', 'step': 3006, 'epoch': 3} {'type': 'loss', 'content': 0.011162908747792244, 'timestamp': '2025-09-04 04:12:31.206237', 'step': 3007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:12:31.281961', 'step': 3007, 'epoch': 3} {'type': 'loss', 'content': 0.002971187699586153, 'timestamp': '2025-09-04 04:12:31.296302', 'step': 3008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:31.401735', 'step': 3008, 'epoch': 3} {'type': 'loss', 'content': 0.0016095120226964355, 'timestamp': '2025-09-04 04:12:31.424309', 'step': 3009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:12:31.531150', 'step': 3009, 'epoch': 3} {'type': 'loss', 'content': 0.0003574864531401545, 'timestamp': '2025-09-04 04:12:31.551257', 'step': 3010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:31.644083', 'step': 3010, 'epoch': 3} {'type': 'loss', 'content': 0.05357489734888077, 'timestamp': '2025-09-04 04:12:31.660973', 'step': 3011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:31.762905', 'step': 3011, 'epoch': 3} {'type': 'loss', 'content': 0.025260241702198982, 'timestamp': '2025-09-04 04:12:31.781102', 'step': 3012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:12:31.895906', 'step': 3012, 'epoch': 3} {'type': 'loss', 'content': 0.002299356274306774, 'timestamp': '2025-09-04 04:12:31.920205', 'step': 3013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:32.012619', 'step': 3013, 'epoch': 3} {'type': 'loss', 'content': 0.005653650965541601, 'timestamp': '2025-09-04 04:12:32.029774', 'step': 3014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:32.138215', 'step': 3014, 'epoch': 3} {'type': 'loss', 'content': 0.004577314481139183, 'timestamp': '2025-09-04 04:12:32.157429', 'step': 3015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:12:32.234613', 'step': 3015, 'epoch': 3} {'type': 'loss', 'content': 0.007822668179869652, 'timestamp': '2025-09-04 04:12:32.249408', 'step': 3016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:32.349594', 'step': 3016, 'epoch': 3} {'type': 'loss', 'content': 0.0035554529167711735, 'timestamp': '2025-09-04 04:12:32.370303', 'step': 3017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:12:32.446897', 'step': 3017, 'epoch': 3} {'type': 'loss', 'content': 0.045765411108732224, 'timestamp': '2025-09-04 04:12:32.460693', 'step': 3018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:12:32.597497', 'step': 3018, 'epoch': 3} {'type': 'loss', 'content': 0.0015078384894877672, 'timestamp': '2025-09-04 04:12:32.623398', 'step': 3019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:32.724410', 'step': 3019, 'epoch': 3} {'type': 'loss', 'content': 0.0004389840178191662, 'timestamp': '2025-09-04 04:12:32.743531', 'step': 3020, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:12:41.245228', 'step': 3020, 'epoch': 3} {'type': 'pplx', 'content': 316.8819015781872, 'timestamp': '2025-09-04 04:12:41.247591', 'step': 3020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:41.352031', 'step': 3020, 'epoch': 3} {'type': 'loss', 'content': 0.01975194923579693, 'timestamp': '2025-09-04 04:12:41.374506', 'step': 3021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:41.485738', 'step': 3021, 'epoch': 3} {'type': 'loss', 'content': 0.002616028068587184, 'timestamp': '2025-09-04 04:12:41.506249', 'step': 3022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:12:41.626978', 'step': 3022, 'epoch': 3} {'type': 'loss', 'content': 0.014300533570349216, 'timestamp': '2025-09-04 04:12:41.648721', 'step': 3023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:12:41.733490', 'step': 3023, 'epoch': 3} {'type': 'loss', 'content': 0.020479435101151466, 'timestamp': '2025-09-04 04:12:41.749264', 'step': 3024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:41.843852', 'step': 3024, 'epoch': 3} {'type': 'loss', 'content': 0.005946870427578688, 'timestamp': '2025-09-04 04:12:41.863014', 'step': 3025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:41.966192', 'step': 3025, 'epoch': 3} {'type': 'loss', 'content': 0.020112913101911545, 'timestamp': '2025-09-04 04:12:41.984766', 'step': 3026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:42.080065', 'step': 3026, 'epoch': 3} {'type': 'loss', 'content': 0.0003505939384922385, 'timestamp': '2025-09-04 04:12:42.097153', 'step': 3027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:12:42.212691', 'step': 3027, 'epoch': 3} {'type': 'loss', 'content': 0.018574239686131477, 'timestamp': '2025-09-04 04:12:42.234095', 'step': 3028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:42.336816', 'step': 3028, 'epoch': 3} {'type': 'loss', 'content': 0.006098731886595488, 'timestamp': '2025-09-04 04:12:42.357909', 'step': 3029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:42.466999', 'step': 3029, 'epoch': 3} {'type': 'loss', 'content': 0.011256312020123005, 'timestamp': '2025-09-04 04:12:42.484487', 'step': 3030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:42.581316', 'step': 3030, 'epoch': 3} {'type': 'loss', 'content': 0.02321520633995533, 'timestamp': '2025-09-04 04:12:42.598704', 'step': 3031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:42.695635', 'step': 3031, 'epoch': 3} {'type': 'loss', 'content': 0.013380464166402817, 'timestamp': '2025-09-04 04:12:42.713836', 'step': 3032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:12:42.827414', 'step': 3032, 'epoch': 3} {'type': 'loss', 'content': 0.06969982385635376, 'timestamp': '2025-09-04 04:12:42.849634', 'step': 3033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:42.960351', 'step': 3033, 'epoch': 3} {'type': 'loss', 'content': 0.001032329280860722, 'timestamp': '2025-09-04 04:12:42.980870', 'step': 3034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:43.081469', 'step': 3034, 'epoch': 3} {'type': 'loss', 'content': 0.00046730105532333255, 'timestamp': '2025-09-04 04:12:43.099978', 'step': 3035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:43.202230', 'step': 3035, 'epoch': 3} {'type': 'loss', 'content': 0.0037853161338716745, 'timestamp': '2025-09-04 04:12:43.221820', 'step': 3036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:12:43.303940', 'step': 3036, 'epoch': 3} {'type': 'loss', 'content': 0.0006179303163662553, 'timestamp': '2025-09-04 04:12:43.320561', 'step': 3037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:43.411798', 'step': 3037, 'epoch': 3} {'type': 'loss', 'content': 0.017078617587685585, 'timestamp': '2025-09-04 04:12:43.428547', 'step': 3038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:43.526095', 'step': 3038, 'epoch': 3} {'type': 'loss', 'content': 0.0008244368946179748, 'timestamp': '2025-09-04 04:12:43.543550', 'step': 3039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:43.648130', 'step': 3039, 'epoch': 3} {'type': 'loss', 'content': 0.0033969872165471315, 'timestamp': '2025-09-04 04:12:43.668121', 'step': 3040, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:12:52.082447', 'step': 3040, 'epoch': 3} {'type': 'pplx', 'content': 313.3448310983682, 'timestamp': '2025-09-04 04:12:52.084610', 'step': 3040, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3040', 'timestamp': '2025-09-04 04:12:52.615369', 'step': 3040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:52.716905', 'step': 3040, 'epoch': 3} {'type': 'loss', 'content': 0.007473244331777096, 'timestamp': '2025-09-04 04:12:52.737524', 'step': 3041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1488], 'flops': 29760180728640.0}, 'timestamp': '2025-09-04 04:12:52.963136', 'step': 3041, 'epoch': 3} {'type': 'loss', 'content': 0.0020905376877635717, 'timestamp': '2025-09-04 04:12:53.005306', 'step': 3042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:53.111301', 'step': 3042, 'epoch': 3} {'type': 'loss', 'content': 0.004675985313951969, 'timestamp': '2025-09-04 04:12:53.130386', 'step': 3043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:53.226870', 'step': 3043, 'epoch': 3} {'type': 'loss', 'content': 0.004783936310559511, 'timestamp': '2025-09-04 04:12:53.244571', 'step': 3044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:12:53.320321', 'step': 3044, 'epoch': 3} {'type': 'loss', 'content': 0.024307608604431152, 'timestamp': '2025-09-04 04:12:53.334873', 'step': 3045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:53.448940', 'step': 3045, 'epoch': 3} {'type': 'loss', 'content': 0.0014469543239101768, 'timestamp': '2025-09-04 04:12:53.469323', 'step': 3046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:12:53.556850', 'step': 3046, 'epoch': 3} {'type': 'loss', 'content': 0.01129270438104868, 'timestamp': '2025-09-04 04:12:53.572473', 'step': 3047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:53.663644', 'step': 3047, 'epoch': 3} {'type': 'loss', 'content': 0.009008025750517845, 'timestamp': '2025-09-04 04:12:53.681178', 'step': 3048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:12:53.784218', 'step': 3048, 'epoch': 3} {'type': 'loss', 'content': 0.05535700172185898, 'timestamp': '2025-09-04 04:12:53.806182', 'step': 3049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:12:53.908028', 'step': 3049, 'epoch': 3} {'type': 'loss', 'content': 0.00018787295266520232, 'timestamp': '2025-09-04 04:12:53.927211', 'step': 3050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:12:54.021968', 'step': 3050, 'epoch': 3} {'type': 'loss', 'content': 0.0013808540534228086, 'timestamp': '2025-09-04 04:12:54.039368', 'step': 3051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:12:54.131919', 'step': 3051, 'epoch': 3} {'type': 'loss', 'content': 0.001885344390757382, 'timestamp': '2025-09-04 04:12:54.149844', 'step': 3052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:12:54.250441', 'step': 3052, 'epoch': 3} {'type': 'loss', 'content': 0.007413564249873161, 'timestamp': '2025-09-04 04:12:54.271642', 'step': 3053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:54.365785', 'step': 3053, 'epoch': 3} {'type': 'loss', 'content': 0.004210531245917082, 'timestamp': '2025-09-04 04:12:54.383238', 'step': 3054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:12:54.492486', 'step': 3054, 'epoch': 3} {'type': 'loss', 'content': 0.004505162592977285, 'timestamp': '2025-09-04 04:12:54.513000', 'step': 3055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:12:54.604034', 'step': 3055, 'epoch': 3} {'type': 'loss', 'content': 0.0008125255117192864, 'timestamp': '2025-09-04 04:12:54.621731', 'step': 3056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:12:54.702929', 'step': 3056, 'epoch': 3} {'type': 'loss', 'content': 0.011885403655469418, 'timestamp': '2025-09-04 04:12:54.719635', 'step': 3057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:12:54.814604', 'step': 3057, 'epoch': 3} {'type': 'loss', 'content': 0.004823492839932442, 'timestamp': '2025-09-04 04:12:54.832086', 'step': 3058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:12:54.932251', 'step': 3058, 'epoch': 3} {'type': 'loss', 'content': 0.023230794817209244, 'timestamp': '2025-09-04 04:12:54.951261', 'step': 3059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:12:55.052619', 'step': 3059, 'epoch': 3} {'type': 'loss', 'content': 0.06023210659623146, 'timestamp': '2025-09-04 04:12:55.071941', 'step': 3060, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:03.577234', 'step': 3060, 'epoch': 3} {'type': 'pplx', 'content': 310.1316931403451, 'timestamp': '2025-09-04 04:13:03.579410', 'step': 3060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:13:03.660065', 'step': 3060, 'epoch': 3} {'type': 'loss', 'content': 0.005574073176831007, 'timestamp': '2025-09-04 04:13:03.676255', 'step': 3061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:03.783932', 'step': 3061, 'epoch': 3} {'type': 'loss', 'content': 0.0004931015428155661, 'timestamp': '2025-09-04 04:13:03.803610', 'step': 3062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:03.910704', 'step': 3062, 'epoch': 3} {'type': 'loss', 'content': 0.005763449240475893, 'timestamp': '2025-09-04 04:13:03.927601', 'step': 3063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:13:04.032696', 'step': 3063, 'epoch': 3} {'type': 'loss', 'content': 0.015147325582802296, 'timestamp': '2025-09-04 04:13:04.052545', 'step': 3064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:04.151878', 'step': 3064, 'epoch': 3} {'type': 'loss', 'content': 0.013065881095826626, 'timestamp': '2025-09-04 04:13:04.172337', 'step': 3065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:13:04.257765', 'step': 3065, 'epoch': 3} {'type': 'loss', 'content': 0.06708889454603195, 'timestamp': '2025-09-04 04:13:04.272762', 'step': 3066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:13:04.478255', 'step': 3066, 'epoch': 3} {'type': 'loss', 'content': 0.0030464141163975, 'timestamp': '2025-09-04 04:13:04.517184', 'step': 3067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:13:04.629622', 'step': 3067, 'epoch': 3} {'type': 'loss', 'content': 0.004817434120923281, 'timestamp': '2025-09-04 04:13:04.650836', 'step': 3068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:13:04.727967', 'step': 3068, 'epoch': 3} {'type': 'loss', 'content': 0.010416961275041103, 'timestamp': '2025-09-04 04:13:04.742894', 'step': 3069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:13:04.853693', 'step': 3069, 'epoch': 3} {'type': 'loss', 'content': 0.01584682986140251, 'timestamp': '2025-09-04 04:13:04.873990', 'step': 3070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:13:04.953426', 'step': 3070, 'epoch': 3} {'type': 'loss', 'content': 0.0056066811084747314, 'timestamp': '2025-09-04 04:13:04.967223', 'step': 3071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:13:05.041133', 'step': 3071, 'epoch': 3} {'type': 'loss', 'content': 0.0062073455192148685, 'timestamp': '2025-09-04 04:13:05.054608', 'step': 3072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:13:05.176203', 'step': 3072, 'epoch': 3} {'type': 'loss', 'content': 0.007986130192875862, 'timestamp': '2025-09-04 04:13:05.201566', 'step': 3073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:13:05.280097', 'step': 3073, 'epoch': 3} {'type': 'loss', 'content': 0.002340728882700205, 'timestamp': '2025-09-04 04:13:05.293930', 'step': 3074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 04:13:05.428660', 'step': 3074, 'epoch': 3} {'type': 'loss', 'content': 0.01518856268376112, 'timestamp': '2025-09-04 04:13:05.453088', 'step': 3075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:05.553750', 'step': 3075, 'epoch': 3} {'type': 'loss', 'content': 0.011933338828384876, 'timestamp': '2025-09-04 04:13:05.573036', 'step': 3076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:05.665519', 'step': 3076, 'epoch': 3} {'type': 'loss', 'content': 0.019831262528896332, 'timestamp': '2025-09-04 04:13:05.684346', 'step': 3077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:05.779207', 'step': 3077, 'epoch': 3} {'type': 'loss', 'content': 0.00772702693939209, 'timestamp': '2025-09-04 04:13:05.796138', 'step': 3078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:13:05.886183', 'step': 3078, 'epoch': 3} {'type': 'loss', 'content': 0.03503193333745003, 'timestamp': '2025-09-04 04:13:05.901623', 'step': 3079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:05.996518', 'step': 3079, 'epoch': 3} {'type': 'loss', 'content': 0.03195195645093918, 'timestamp': '2025-09-04 04:13:06.014248', 'step': 3080, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:14.451244', 'step': 3080, 'epoch': 3} {'type': 'pplx', 'content': 309.69911719013754, 'timestamp': '2025-09-04 04:13:14.453284', 'step': 3080, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3080', 'timestamp': '2025-09-04 04:13:14.996139', 'step': 3080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:15.099957', 'step': 3080, 'epoch': 3} {'type': 'loss', 'content': 0.005432614590972662, 'timestamp': '2025-09-04 04:13:15.122154', 'step': 3081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:15.215538', 'step': 3081, 'epoch': 3} {'type': 'loss', 'content': 0.07827738672494888, 'timestamp': '2025-09-04 04:13:15.232831', 'step': 3082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:13:15.335844', 'step': 3082, 'epoch': 3} {'type': 'loss', 'content': 0.0020832966547459364, 'timestamp': '2025-09-04 04:13:15.355072', 'step': 3083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:15.464323', 'step': 3083, 'epoch': 3} {'type': 'loss', 'content': 0.003370664082467556, 'timestamp': '2025-09-04 04:13:15.485209', 'step': 3084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:15.590624', 'step': 3084, 'epoch': 3} {'type': 'loss', 'content': 0.0391119010746479, 'timestamp': '2025-09-04 04:13:15.612896', 'step': 3085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:15.719186', 'step': 3085, 'epoch': 3} {'type': 'loss', 'content': 0.0071410383097827435, 'timestamp': '2025-09-04 04:13:15.739191', 'step': 3086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:15.846022', 'step': 3086, 'epoch': 3} {'type': 'loss', 'content': 0.044355932623147964, 'timestamp': '2025-09-04 04:13:15.866073', 'step': 3087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:15.971664', 'step': 3087, 'epoch': 3} {'type': 'loss', 'content': 0.02203553542494774, 'timestamp': '2025-09-04 04:13:15.992432', 'step': 3088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:13:16.066807', 'step': 3088, 'epoch': 3} {'type': 'loss', 'content': 0.002954278141260147, 'timestamp': '2025-09-04 04:13:16.081846', 'step': 3089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:13:16.163704', 'step': 3089, 'epoch': 3} {'type': 'loss', 'content': 0.0011478732340037823, 'timestamp': '2025-09-04 04:13:16.178843', 'step': 3090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:13:16.274385', 'step': 3090, 'epoch': 3} {'type': 'loss', 'content': 0.03504842892289162, 'timestamp': '2025-09-04 04:13:16.291859', 'step': 3091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:13:16.368898', 'step': 3091, 'epoch': 3} {'type': 'loss', 'content': 0.0484900027513504, 'timestamp': '2025-09-04 04:13:16.383661', 'step': 3092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:16.472091', 'step': 3092, 'epoch': 3} {'type': 'loss', 'content': 0.02142166718840599, 'timestamp': '2025-09-04 04:13:16.490533', 'step': 3093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:13:16.574016', 'step': 3093, 'epoch': 3} {'type': 'loss', 'content': 0.0017112598288804293, 'timestamp': '2025-09-04 04:13:16.589190', 'step': 3094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:16.698367', 'step': 3094, 'epoch': 3} {'type': 'loss', 'content': 0.0018777156947180629, 'timestamp': '2025-09-04 04:13:16.718616', 'step': 3095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:16.818290', 'step': 3095, 'epoch': 3} {'type': 'loss', 'content': 0.04635737091302872, 'timestamp': '2025-09-04 04:13:16.837679', 'step': 3096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:13:16.918617', 'step': 3096, 'epoch': 3} {'type': 'loss', 'content': 0.000942113867495209, 'timestamp': '2025-09-04 04:13:16.933954', 'step': 3097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:13:17.010857', 'step': 3097, 'epoch': 3} {'type': 'loss', 'content': 0.0178525447845459, 'timestamp': '2025-09-04 04:13:17.024613', 'step': 3098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:13:17.100145', 'step': 3098, 'epoch': 3} {'type': 'loss', 'content': 0.002621579449623823, 'timestamp': '2025-09-04 04:13:17.113931', 'step': 3099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:13:17.197423', 'step': 3099, 'epoch': 3} {'type': 'loss', 'content': 0.003176899626851082, 'timestamp': '2025-09-04 04:13:17.213366', 'step': 3100, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:25.591497', 'step': 3100, 'epoch': 3} {'type': 'pplx', 'content': 309.6383679000778, 'timestamp': '2025-09-04 04:13:25.594304', 'step': 3100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:13:25.708607', 'step': 3100, 'epoch': 3} {'type': 'loss', 'content': 0.018567724153399467, 'timestamp': '2025-09-04 04:13:25.732430', 'step': 3101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:13:25.836648', 'step': 3101, 'epoch': 3} {'type': 'loss', 'content': 0.001145319314673543, 'timestamp': '2025-09-04 04:13:25.855894', 'step': 3102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:25.953996', 'step': 3102, 'epoch': 3} {'type': 'loss', 'content': 0.0030605667270720005, 'timestamp': '2025-09-04 04:13:25.972593', 'step': 3103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:13:26.083056', 'step': 3103, 'epoch': 3} {'type': 'loss', 'content': 0.003627515397965908, 'timestamp': '2025-09-04 04:13:26.104379', 'step': 3104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:26.195117', 'step': 3104, 'epoch': 3} {'type': 'loss', 'content': 0.0005803365493193269, 'timestamp': '2025-09-04 04:13:26.214144', 'step': 3105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:13:26.288628', 'step': 3105, 'epoch': 3} {'type': 'loss', 'content': 0.001379349734634161, 'timestamp': '2025-09-04 04:13:26.302105', 'step': 3106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:13:26.411779', 'step': 3106, 'epoch': 3} {'type': 'loss', 'content': 0.01873205043375492, 'timestamp': '2025-09-04 04:13:26.432178', 'step': 3107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:13:26.535336', 'step': 3107, 'epoch': 3} {'type': 'loss', 'content': 0.0028933845460414886, 'timestamp': '2025-09-04 04:13:26.555296', 'step': 3108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:26.660819', 'step': 3108, 'epoch': 3} {'type': 'loss', 'content': 0.001489466754719615, 'timestamp': '2025-09-04 04:13:26.682708', 'step': 3109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:13:26.757809', 'step': 3109, 'epoch': 3} {'type': 'loss', 'content': 0.002094303723424673, 'timestamp': '2025-09-04 04:13:26.771384', 'step': 3110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:26.866276', 'step': 3110, 'epoch': 3} {'type': 'loss', 'content': 0.0006700966041535139, 'timestamp': '2025-09-04 04:13:26.883690', 'step': 3111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:26.983151', 'step': 3111, 'epoch': 3} {'type': 'loss', 'content': 0.01674632728099823, 'timestamp': '2025-09-04 04:13:27.002754', 'step': 3112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:27.105954', 'step': 3112, 'epoch': 3} {'type': 'loss', 'content': 0.08993439376354218, 'timestamp': '2025-09-04 04:13:27.127787', 'step': 3113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:13:27.237145', 'step': 3113, 'epoch': 3} {'type': 'loss', 'content': 0.012733870185911655, 'timestamp': '2025-09-04 04:13:27.257788', 'step': 3114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:13:27.330723', 'step': 3114, 'epoch': 3} {'type': 'loss', 'content': 0.025248989462852478, 'timestamp': '2025-09-04 04:13:27.343584', 'step': 3115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:27.437623', 'step': 3115, 'epoch': 3} {'type': 'loss', 'content': 0.0020341791678220034, 'timestamp': '2025-09-04 04:13:27.455792', 'step': 3116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:27.547797', 'step': 3116, 'epoch': 3} {'type': 'loss', 'content': 0.0012790296459570527, 'timestamp': '2025-09-04 04:13:27.566869', 'step': 3117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:13:27.670689', 'step': 3117, 'epoch': 3} {'type': 'loss', 'content': 0.003262518672272563, 'timestamp': '2025-09-04 04:13:27.689875', 'step': 3118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:27.782359', 'step': 3118, 'epoch': 3} {'type': 'loss', 'content': 0.04332924634218216, 'timestamp': '2025-09-04 04:13:27.799486', 'step': 3119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:13:27.894946', 'step': 3119, 'epoch': 3} {'type': 'loss', 'content': 0.0012113949051126838, 'timestamp': '2025-09-04 04:13:27.913216', 'step': 3120, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:36.378564', 'step': 3120, 'epoch': 3} {'type': 'pplx', 'content': 307.53064608444055, 'timestamp': '2025-09-04 04:13:36.380625', 'step': 3120, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3120', 'timestamp': '2025-09-04 04:13:36.827667', 'step': 3120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:13:36.933924', 'step': 3120, 'epoch': 3} {'type': 'loss', 'content': 0.0004694383533205837, 'timestamp': '2025-09-04 04:13:36.956474', 'step': 3121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:37.050096', 'step': 3121, 'epoch': 3} {'type': 'loss', 'content': 0.00680502038449049, 'timestamp': '2025-09-04 04:13:37.067253', 'step': 3122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:37.166392', 'step': 3122, 'epoch': 3} {'type': 'loss', 'content': 0.009919785894453526, 'timestamp': '2025-09-04 04:13:37.184994', 'step': 3123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:13:37.286944', 'step': 3123, 'epoch': 3} {'type': 'loss', 'content': 0.016770748421549797, 'timestamp': '2025-09-04 04:13:37.307060', 'step': 3124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:37.398424', 'step': 3124, 'epoch': 3} {'type': 'loss', 'content': 0.005784905049949884, 'timestamp': '2025-09-04 04:13:37.417173', 'step': 3125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:37.517434', 'step': 3125, 'epoch': 3} {'type': 'loss', 'content': 0.02264421060681343, 'timestamp': '2025-09-04 04:13:37.536320', 'step': 3126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 04:13:37.600702', 'step': 3126, 'epoch': 3} {'type': 'loss', 'content': 0.004687449894845486, 'timestamp': '2025-09-04 04:13:37.611963', 'step': 3127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:13:37.728323', 'step': 3127, 'epoch': 3} {'type': 'loss', 'content': 0.051042910665273666, 'timestamp': '2025-09-04 04:13:37.751279', 'step': 3128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:37.840140', 'step': 3128, 'epoch': 3} {'type': 'loss', 'content': 0.0002769632264971733, 'timestamp': '2025-09-04 04:13:37.858257', 'step': 3129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:13:37.936618', 'step': 3129, 'epoch': 3} {'type': 'loss', 'content': 0.011999246664345264, 'timestamp': '2025-09-04 04:13:37.950724', 'step': 3130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:38.050492', 'step': 3130, 'epoch': 3} {'type': 'loss', 'content': 0.005194882862269878, 'timestamp': '2025-09-04 04:13:38.069413', 'step': 3131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:38.169541', 'step': 3131, 'epoch': 3} {'type': 'loss', 'content': 0.037000637501478195, 'timestamp': '2025-09-04 04:13:38.189321', 'step': 3132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:13:38.273497', 'step': 3132, 'epoch': 3} {'type': 'loss', 'content': 0.010427704080939293, 'timestamp': '2025-09-04 04:13:38.290645', 'step': 3133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:13:38.396605', 'step': 3133, 'epoch': 3} {'type': 'loss', 'content': 0.002125280909240246, 'timestamp': '2025-09-04 04:13:38.416728', 'step': 3134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:38.512019', 'step': 3134, 'epoch': 3} {'type': 'loss', 'content': 0.0023927779402583838, 'timestamp': '2025-09-04 04:13:38.529572', 'step': 3135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:38.628419', 'step': 3135, 'epoch': 3} {'type': 'loss', 'content': 0.00024334284535143524, 'timestamp': '2025-09-04 04:13:38.647962', 'step': 3136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:38.738054', 'step': 3136, 'epoch': 3} {'type': 'loss', 'content': 0.0012799968244507909, 'timestamp': '2025-09-04 04:13:38.756943', 'step': 3137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:13:38.859262', 'step': 3137, 'epoch': 3} {'type': 'loss', 'content': 0.010953391902148724, 'timestamp': '2025-09-04 04:13:38.878460', 'step': 3138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:13:38.956198', 'step': 3138, 'epoch': 3} {'type': 'loss', 'content': 0.0005597640410996974, 'timestamp': '2025-09-04 04:13:38.970039', 'step': 3139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:39.060351', 'step': 3139, 'epoch': 3} {'type': 'loss', 'content': 0.017131542786955833, 'timestamp': '2025-09-04 04:13:39.077857', 'step': 3140, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:47.459705', 'step': 3140, 'epoch': 3} {'type': 'pplx', 'content': 303.543175064668, 'timestamp': '2025-09-04 04:13:47.461952', 'step': 3140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:13:47.544243', 'step': 3140, 'epoch': 3} {'type': 'loss', 'content': 0.005871990229934454, 'timestamp': '2025-09-04 04:13:47.561346', 'step': 3141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:13:47.683316', 'step': 3141, 'epoch': 3} {'type': 'loss', 'content': 0.004386106040328741, 'timestamp': '2025-09-04 04:13:47.706499', 'step': 3142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:47.800908', 'step': 3142, 'epoch': 3} {'type': 'loss', 'content': 0.0068616243079304695, 'timestamp': '2025-09-04 04:13:47.818294', 'step': 3143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:47.911953', 'step': 3143, 'epoch': 3} {'type': 'loss', 'content': 0.0012470950605347753, 'timestamp': '2025-09-04 04:13:47.930050', 'step': 3144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:13:48.062441', 'step': 3144, 'epoch': 3} {'type': 'loss', 'content': 0.008273441344499588, 'timestamp': '2025-09-04 04:13:48.090809', 'step': 3145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:48.190377', 'step': 3145, 'epoch': 3} {'type': 'loss', 'content': 0.05135876312851906, 'timestamp': '2025-09-04 04:13:48.209014', 'step': 3146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:48.301863', 'step': 3146, 'epoch': 3} {'type': 'loss', 'content': 0.026613635942339897, 'timestamp': '2025-09-04 04:13:48.318989', 'step': 3147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:48.427395', 'step': 3147, 'epoch': 3} {'type': 'loss', 'content': 0.006781783886253834, 'timestamp': '2025-09-04 04:13:48.448382', 'step': 3148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:48.540151', 'step': 3148, 'epoch': 3} {'type': 'loss', 'content': 0.0013767415657639503, 'timestamp': '2025-09-04 04:13:48.558954', 'step': 3149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:13:48.663250', 'step': 3149, 'epoch': 3} {'type': 'loss', 'content': 0.06130528450012207, 'timestamp': '2025-09-04 04:13:48.682512', 'step': 3150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:13:48.792712', 'step': 3150, 'epoch': 3} {'type': 'loss', 'content': 0.011846323497593403, 'timestamp': '2025-09-04 04:13:48.813358', 'step': 3151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:48.904857', 'step': 3151, 'epoch': 3} {'type': 'loss', 'content': 0.015273387543857098, 'timestamp': '2025-09-04 04:13:48.922531', 'step': 3152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:13:49.043109', 'step': 3152, 'epoch': 3} {'type': 'loss', 'content': 0.0032542271073907614, 'timestamp': '2025-09-04 04:13:49.068724', 'step': 3153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:49.161574', 'step': 3153, 'epoch': 3} {'type': 'loss', 'content': 0.002285633934661746, 'timestamp': '2025-09-04 04:13:49.178731', 'step': 3154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:13:49.280117', 'step': 3154, 'epoch': 3} {'type': 'loss', 'content': 0.009557440876960754, 'timestamp': '2025-09-04 04:13:49.299107', 'step': 3155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:49.389864', 'step': 3155, 'epoch': 3} {'type': 'loss', 'content': 0.018284492194652557, 'timestamp': '2025-09-04 04:13:49.407496', 'step': 3156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:13:49.498858', 'step': 3156, 'epoch': 3} {'type': 'loss', 'content': 0.0062835500575602055, 'timestamp': '2025-09-04 04:13:49.517799', 'step': 3157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:13:49.595561', 'step': 3157, 'epoch': 3} {'type': 'loss', 'content': 0.006542867515236139, 'timestamp': '2025-09-04 04:13:49.609201', 'step': 3158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:49.708783', 'step': 3158, 'epoch': 3} {'type': 'loss', 'content': 0.004803582560271025, 'timestamp': '2025-09-04 04:13:49.727388', 'step': 3159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:13:49.813834', 'step': 3159, 'epoch': 3} {'type': 'loss', 'content': 0.005237384233623743, 'timestamp': '2025-09-04 04:13:49.830356', 'step': 3160, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:13:58.274255', 'step': 3160, 'epoch': 3} {'type': 'pplx', 'content': 299.2577897885672, 'timestamp': '2025-09-04 04:13:58.276754', 'step': 3160, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3160', 'timestamp': '2025-09-04 04:13:58.818167', 'step': 3160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:13:58.901615', 'step': 3160, 'epoch': 3} {'type': 'loss', 'content': 0.012152140960097313, 'timestamp': '2025-09-04 04:13:58.917815', 'step': 3161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:59.010068', 'step': 3161, 'epoch': 3} {'type': 'loss', 'content': 0.0011444678530097008, 'timestamp': '2025-09-04 04:13:59.026556', 'step': 3162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:13:59.127832', 'step': 3162, 'epoch': 3} {'type': 'loss', 'content': 0.0307242963463068, 'timestamp': '2025-09-04 04:13:59.146322', 'step': 3163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:13:59.262752', 'step': 3163, 'epoch': 3} {'type': 'loss', 'content': 0.009461762383580208, 'timestamp': '2025-09-04 04:13:59.284146', 'step': 3164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:13:59.373507', 'step': 3164, 'epoch': 3} {'type': 'loss', 'content': 0.0035491350572556257, 'timestamp': '2025-09-04 04:13:59.391646', 'step': 3165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:13:59.501388', 'step': 3165, 'epoch': 3} {'type': 'loss', 'content': 0.0005383518873713911, 'timestamp': '2025-09-04 04:13:59.521601', 'step': 3166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:13:59.632898', 'step': 3166, 'epoch': 3} {'type': 'loss', 'content': 0.005555047653615475, 'timestamp': '2025-09-04 04:13:59.653292', 'step': 3167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:13:59.738310', 'step': 3167, 'epoch': 3} {'type': 'loss', 'content': 0.0026055641938000917, 'timestamp': '2025-09-04 04:13:59.754242', 'step': 3168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:13:59.846632', 'step': 3168, 'epoch': 3} {'type': 'loss', 'content': 0.007750915363430977, 'timestamp': '2025-09-04 04:13:59.865546', 'step': 3169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:13:59.975571', 'step': 3169, 'epoch': 3} {'type': 'loss', 'content': 0.030947135761380196, 'timestamp': '2025-09-04 04:13:59.996080', 'step': 3170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:00.095959', 'step': 3170, 'epoch': 3} {'type': 'loss', 'content': 0.008231641724705696, 'timestamp': '2025-09-04 04:14:00.113266', 'step': 3171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:00.217518', 'step': 3171, 'epoch': 3} {'type': 'loss', 'content': 0.0042179482989013195, 'timestamp': '2025-09-04 04:14:00.237314', 'step': 3172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:14:00.346381', 'step': 3172, 'epoch': 3} {'type': 'loss', 'content': 0.0010643589776009321, 'timestamp': '2025-09-04 04:14:00.368931', 'step': 3173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:00.473465', 'step': 3173, 'epoch': 3} {'type': 'loss', 'content': 0.01745988056063652, 'timestamp': '2025-09-04 04:14:00.492705', 'step': 3174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:14:00.586556', 'step': 3174, 'epoch': 3} {'type': 'loss', 'content': 0.040873829275369644, 'timestamp': '2025-09-04 04:14:00.603677', 'step': 3175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:00.705209', 'step': 3175, 'epoch': 3} {'type': 'loss', 'content': 0.0004095417389180511, 'timestamp': '2025-09-04 04:14:00.724761', 'step': 3176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:00.826009', 'step': 3176, 'epoch': 3} {'type': 'loss', 'content': 0.05139699578285217, 'timestamp': '2025-09-04 04:14:00.847061', 'step': 3177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:00.947870', 'step': 3177, 'epoch': 3} {'type': 'loss', 'content': 0.01573573239147663, 'timestamp': '2025-09-04 04:14:00.966290', 'step': 3178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:14:01.038915', 'step': 3178, 'epoch': 3} {'type': 'loss', 'content': 0.004238112363964319, 'timestamp': '2025-09-04 04:14:01.051828', 'step': 3179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:01.146291', 'step': 3179, 'epoch': 3} {'type': 'loss', 'content': 0.013266210444271564, 'timestamp': '2025-09-04 04:14:01.164460', 'step': 3180, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:14:09.548016', 'step': 3180, 'epoch': 3} {'type': 'pplx', 'content': 297.72411337269, 'timestamp': '2025-09-04 04:14:09.549979', 'step': 3180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:09.645901', 'step': 3180, 'epoch': 3} {'type': 'loss', 'content': 0.004828155972063541, 'timestamp': '2025-09-04 04:14:09.666733', 'step': 3181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:14:09.737653', 'step': 3181, 'epoch': 3} {'type': 'loss', 'content': 0.00019382215396035463, 'timestamp': '2025-09-04 04:14:09.750394', 'step': 3182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:09.843532', 'step': 3182, 'epoch': 3} {'type': 'loss', 'content': 0.012143196538090706, 'timestamp': '2025-09-04 04:14:09.860957', 'step': 3183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:09.966472', 'step': 3183, 'epoch': 3} {'type': 'loss', 'content': 0.012787654995918274, 'timestamp': '2025-09-04 04:14:09.987350', 'step': 3184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:10.101021', 'step': 3184, 'epoch': 3} {'type': 'loss', 'content': 0.0021711078006774187, 'timestamp': '2025-09-04 04:14:10.122082', 'step': 3185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:14:10.199346', 'step': 3185, 'epoch': 3} {'type': 'loss', 'content': 0.016758840531110764, 'timestamp': '2025-09-04 04:14:10.213436', 'step': 3186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:10.318855', 'step': 3186, 'epoch': 3} {'type': 'loss', 'content': 0.0012713070027530193, 'timestamp': '2025-09-04 04:14:10.338888', 'step': 3187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:10.441584', 'step': 3187, 'epoch': 3} {'type': 'loss', 'content': 0.002686247928068042, 'timestamp': '2025-09-04 04:14:10.461620', 'step': 3188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:14:10.567666', 'step': 3188, 'epoch': 3} {'type': 'loss', 'content': 0.009823828935623169, 'timestamp': '2025-09-04 04:14:10.590298', 'step': 3189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:14:10.673129', 'step': 3189, 'epoch': 3} {'type': 'loss', 'content': 0.011689892038702965, 'timestamp': '2025-09-04 04:14:10.688286', 'step': 3190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:14:10.771771', 'step': 3190, 'epoch': 3} {'type': 'loss', 'content': 0.0027343537658452988, 'timestamp': '2025-09-04 04:14:10.787075', 'step': 3191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:10.886962', 'step': 3191, 'epoch': 3} {'type': 'loss', 'content': 0.008726535364985466, 'timestamp': '2025-09-04 04:14:10.906597', 'step': 3192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:11.007668', 'step': 3192, 'epoch': 3} {'type': 'loss', 'content': 0.006816827226430178, 'timestamp': '2025-09-04 04:14:11.028767', 'step': 3193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:14:11.141248', 'step': 3193, 'epoch': 3} {'type': 'loss', 'content': 0.00433404790237546, 'timestamp': '2025-09-04 04:14:11.161661', 'step': 3194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:14:11.271042', 'step': 3194, 'epoch': 3} {'type': 'loss', 'content': 0.005224962718784809, 'timestamp': '2025-09-04 04:14:11.291628', 'step': 3195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:11.402317', 'step': 3195, 'epoch': 3} {'type': 'loss', 'content': 0.00549314683303237, 'timestamp': '2025-09-04 04:14:11.421990', 'step': 3196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:11.512659', 'step': 3196, 'epoch': 3} {'type': 'loss', 'content': 0.015487665310502052, 'timestamp': '2025-09-04 04:14:11.531859', 'step': 3197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:11.627288', 'step': 3197, 'epoch': 3} {'type': 'loss', 'content': 0.005331105552613735, 'timestamp': '2025-09-04 04:14:11.644713', 'step': 3198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:11.745595', 'step': 3198, 'epoch': 3} {'type': 'loss', 'content': 0.0038703870959579945, 'timestamp': '2025-09-04 04:14:11.764490', 'step': 3199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:14:11.841979', 'step': 3199, 'epoch': 3} {'type': 'loss', 'content': 0.013186764903366566, 'timestamp': '2025-09-04 04:14:11.856696', 'step': 3200, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:14:20.282753', 'step': 3200, 'epoch': 3} {'type': 'pplx', 'content': 298.56920811002476, 'timestamp': '2025-09-04 04:14:20.284562', 'step': 3200, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3200', 'timestamp': '2025-09-04 04:14:20.791042', 'step': 3200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:14:20.863498', 'step': 3200, 'epoch': 3} {'type': 'loss', 'content': 0.027715997770428658, 'timestamp': '2025-09-04 04:14:20.878154', 'step': 3201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:20.982383', 'step': 3201, 'epoch': 3} {'type': 'loss', 'content': 0.0028067301027476788, 'timestamp': '2025-09-04 04:14:21.001527', 'step': 3202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:21.100913', 'step': 3202, 'epoch': 3} {'type': 'loss', 'content': 0.0029784520156681538, 'timestamp': '2025-09-04 04:14:21.119598', 'step': 3203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:21.215903', 'step': 3203, 'epoch': 3} {'type': 'loss', 'content': 0.0007651972700841725, 'timestamp': '2025-09-04 04:14:21.234311', 'step': 3204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:21.335668', 'step': 3204, 'epoch': 3} {'type': 'loss', 'content': 0.03393160179257393, 'timestamp': '2025-09-04 04:14:21.356658', 'step': 3205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:21.452878', 'step': 3205, 'epoch': 3} {'type': 'loss', 'content': 0.0033680200576782227, 'timestamp': '2025-09-04 04:14:21.470525', 'step': 3206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:14:21.556580', 'step': 3206, 'epoch': 3} {'type': 'loss', 'content': 0.022258544340729713, 'timestamp': '2025-09-04 04:14:21.572030', 'step': 3207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:21.666815', 'step': 3207, 'epoch': 3} {'type': 'loss', 'content': 0.000977307092398405, 'timestamp': '2025-09-04 04:14:21.685202', 'step': 3208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:14:21.802759', 'step': 3208, 'epoch': 3} {'type': 'loss', 'content': 0.009067521430552006, 'timestamp': '2025-09-04 04:14:21.826562', 'step': 3209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:21.930639', 'step': 3209, 'epoch': 3} {'type': 'loss', 'content': 0.005365348886698484, 'timestamp': '2025-09-04 04:14:21.949872', 'step': 3210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:22.049801', 'step': 3210, 'epoch': 3} {'type': 'loss', 'content': 0.016756800934672356, 'timestamp': '2025-09-04 04:14:22.068544', 'step': 3211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:14:22.152516', 'step': 3211, 'epoch': 3} {'type': 'loss', 'content': 0.0015038455603644252, 'timestamp': '2025-09-04 04:14:22.167154', 'step': 3212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:14:22.248469', 'step': 3212, 'epoch': 3} {'type': 'loss', 'content': 0.002096136100590229, 'timestamp': '2025-09-04 04:14:22.265055', 'step': 3213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:14:22.347176', 'step': 3213, 'epoch': 3} {'type': 'loss', 'content': 0.006991859059780836, 'timestamp': '2025-09-04 04:14:22.362368', 'step': 3214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:22.465008', 'step': 3214, 'epoch': 3} {'type': 'loss', 'content': 0.004486733116209507, 'timestamp': '2025-09-04 04:14:22.484327', 'step': 3215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:14:22.574918', 'step': 3215, 'epoch': 3} {'type': 'loss', 'content': 0.015569731593132019, 'timestamp': '2025-09-04 04:14:22.592568', 'step': 3216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:22.690066', 'step': 3216, 'epoch': 3} {'type': 'loss', 'content': 0.02023530937731266, 'timestamp': '2025-09-04 04:14:22.710895', 'step': 3217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:14:22.834269', 'step': 3217, 'epoch': 3} {'type': 'loss', 'content': 0.009704073891043663, 'timestamp': '2025-09-04 04:14:22.857521', 'step': 3218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:22.960755', 'step': 3218, 'epoch': 3} {'type': 'loss', 'content': 0.00012312929902691394, 'timestamp': '2025-09-04 04:14:22.980128', 'step': 3219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:14:23.101629', 'step': 3219, 'epoch': 3} {'type': 'loss', 'content': 0.0024548498913645744, 'timestamp': '2025-09-04 04:14:23.125454', 'step': 3220, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:14:31.604710', 'step': 3220, 'epoch': 3} {'type': 'pplx', 'content': 302.84948080396117, 'timestamp': '2025-09-04 04:14:31.606931', 'step': 3220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:14:31.688935', 'step': 3220, 'epoch': 3} {'type': 'loss', 'content': 0.03970217704772949, 'timestamp': '2025-09-04 04:14:31.706195', 'step': 3221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:31.806815', 'step': 3221, 'epoch': 3} {'type': 'loss', 'content': 0.0249653160572052, 'timestamp': '2025-09-04 04:14:31.825636', 'step': 3222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:14:31.903292', 'step': 3222, 'epoch': 3} {'type': 'loss', 'content': 0.00563463568687439, 'timestamp': '2025-09-04 04:14:31.917510', 'step': 3223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:14:32.010566', 'step': 3223, 'epoch': 3} {'type': 'loss', 'content': 0.012554388493299484, 'timestamp': '2025-09-04 04:14:32.028457', 'step': 3224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:14:32.118821', 'step': 3224, 'epoch': 3} {'type': 'loss', 'content': 0.045332036912441254, 'timestamp': '2025-09-04 04:14:32.137174', 'step': 3225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:32.238227', 'step': 3225, 'epoch': 3} {'type': 'loss', 'content': 0.0016650618053972721, 'timestamp': '2025-09-04 04:14:32.257069', 'step': 3226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:32.350727', 'step': 3226, 'epoch': 3} {'type': 'loss', 'content': 0.0041397614404559135, 'timestamp': '2025-09-04 04:14:32.368149', 'step': 3227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:32.462491', 'step': 3227, 'epoch': 3} {'type': 'loss', 'content': 0.0027826486621052027, 'timestamp': '2025-09-04 04:14:32.480741', 'step': 3228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:14:32.568652', 'step': 3228, 'epoch': 3} {'type': 'loss', 'content': 0.007442697882652283, 'timestamp': '2025-09-04 04:14:32.587093', 'step': 3229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:14:32.665351', 'step': 3229, 'epoch': 3} {'type': 'loss', 'content': 0.006308171898126602, 'timestamp': '2025-09-04 04:14:32.679438', 'step': 3230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:32.772202', 'step': 3230, 'epoch': 3} {'type': 'loss', 'content': 0.0004488340055104345, 'timestamp': '2025-09-04 04:14:32.789529', 'step': 3231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:14:32.866260', 'step': 3231, 'epoch': 3} {'type': 'loss', 'content': 0.0021919619757682085, 'timestamp': '2025-09-04 04:14:32.881055', 'step': 3232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:14:32.971578', 'step': 3232, 'epoch': 3} {'type': 'loss', 'content': 0.007420377805829048, 'timestamp': '2025-09-04 04:14:32.990460', 'step': 3233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:33.092494', 'step': 3233, 'epoch': 3} {'type': 'loss', 'content': 0.002644237130880356, 'timestamp': '2025-09-04 04:14:33.111654', 'step': 3234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:14:33.247913', 'step': 3234, 'epoch': 3} {'type': 'loss', 'content': 0.0011826629051938653, 'timestamp': '2025-09-04 04:14:33.274252', 'step': 3235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:33.375785', 'step': 3235, 'epoch': 3} {'type': 'loss', 'content': 0.02761908806860447, 'timestamp': '2025-09-04 04:14:33.395525', 'step': 3236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:33.497828', 'step': 3236, 'epoch': 3} {'type': 'loss', 'content': 0.06943929940462112, 'timestamp': '2025-09-04 04:14:33.519104', 'step': 3237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:33.618795', 'step': 3237, 'epoch': 3} {'type': 'loss', 'content': 0.012753068469464779, 'timestamp': '2025-09-04 04:14:33.637467', 'step': 3238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:33.741847', 'step': 3238, 'epoch': 3} {'type': 'loss', 'content': 0.008138732053339481, 'timestamp': '2025-09-04 04:14:33.761229', 'step': 3239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:33.864802', 'step': 3239, 'epoch': 3} {'type': 'loss', 'content': 0.0004093741299584508, 'timestamp': '2025-09-04 04:14:33.884886', 'step': 3240, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:14:42.381848', 'step': 3240, 'epoch': 3} {'type': 'pplx', 'content': 302.56651160009335, 'timestamp': '2025-09-04 04:14:42.383805', 'step': 3240, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3240', 'timestamp': '2025-09-04 04:14:42.856128', 'step': 3240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:14:42.923762', 'step': 3240, 'epoch': 3} {'type': 'loss', 'content': 0.003945726901292801, 'timestamp': '2025-09-04 04:14:42.937279', 'step': 3241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:14:43.015815', 'step': 3241, 'epoch': 3} {'type': 'loss', 'content': 0.0013066886458545923, 'timestamp': '2025-09-04 04:14:43.030023', 'step': 3242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:43.128688', 'step': 3242, 'epoch': 3} {'type': 'loss', 'content': 0.005977214314043522, 'timestamp': '2025-09-04 04:14:43.147386', 'step': 3243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:43.247664', 'step': 3243, 'epoch': 3} {'type': 'loss', 'content': 0.006708620116114616, 'timestamp': '2025-09-04 04:14:43.267330', 'step': 3244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:43.366677', 'step': 3244, 'epoch': 3} {'type': 'loss', 'content': 0.010317733511328697, 'timestamp': '2025-09-04 04:14:43.387825', 'step': 3245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:14:43.497352', 'step': 3245, 'epoch': 3} {'type': 'loss', 'content': 0.001994991209357977, 'timestamp': '2025-09-04 04:14:43.518042', 'step': 3246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:43.613568', 'step': 3246, 'epoch': 3} {'type': 'loss', 'content': 0.0030987162608653307, 'timestamp': '2025-09-04 04:14:43.631222', 'step': 3247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:14:43.740095', 'step': 3247, 'epoch': 3} {'type': 'loss', 'content': 0.0005180987645871937, 'timestamp': '2025-09-04 04:14:43.761304', 'step': 3248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:43.852971', 'step': 3248, 'epoch': 3} {'type': 'loss', 'content': 0.00039079232374206185, 'timestamp': '2025-09-04 04:14:43.872164', 'step': 3249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:43.966444', 'step': 3249, 'epoch': 3} {'type': 'loss', 'content': 0.00027972026146017015, 'timestamp': '2025-09-04 04:14:43.984013', 'step': 3250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:14:44.093314', 'step': 3250, 'epoch': 3} {'type': 'loss', 'content': 0.0020345128141343594, 'timestamp': '2025-09-04 04:14:44.113985', 'step': 3251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:44.214840', 'step': 3251, 'epoch': 3} {'type': 'loss', 'content': 0.008928967639803886, 'timestamp': '2025-09-04 04:14:44.234614', 'step': 3252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:44.326751', 'step': 3252, 'epoch': 3} {'type': 'loss', 'content': 0.0039917477406561375, 'timestamp': '2025-09-04 04:14:44.346001', 'step': 3253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:14:44.448569', 'step': 3253, 'epoch': 3} {'type': 'loss', 'content': 0.0008001399110071361, 'timestamp': '2025-09-04 04:14:44.467465', 'step': 3254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:44.562224', 'step': 3254, 'epoch': 3} {'type': 'loss', 'content': 0.0018813287606462836, 'timestamp': '2025-09-04 04:14:44.579611', 'step': 3255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:14:44.665229', 'step': 3255, 'epoch': 3} {'type': 'loss', 'content': 0.0071727619506418705, 'timestamp': '2025-09-04 04:14:44.681606', 'step': 3256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:14:44.755422', 'step': 3256, 'epoch': 3} {'type': 'loss', 'content': 0.019573703408241272, 'timestamp': '2025-09-04 04:14:44.770563', 'step': 3257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:14:44.879668', 'step': 3257, 'epoch': 3} {'type': 'loss', 'content': 0.010637468658387661, 'timestamp': '2025-09-04 04:14:44.900200', 'step': 3258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:44.995225', 'step': 3258, 'epoch': 3} {'type': 'loss', 'content': 0.0005085250595584512, 'timestamp': '2025-09-04 04:14:45.012783', 'step': 3259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:45.118189', 'step': 3259, 'epoch': 3} {'type': 'loss', 'content': 0.004459428135305643, 'timestamp': '2025-09-04 04:14:45.139052', 'step': 3260, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:14:53.619951', 'step': 3260, 'epoch': 3} {'type': 'pplx', 'content': 303.6349730675357, 'timestamp': '2025-09-04 04:14:53.621797', 'step': 3260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:53.720593', 'step': 3260, 'epoch': 3} {'type': 'loss', 'content': 0.007222423795610666, 'timestamp': '2025-09-04 04:14:53.741702', 'step': 3261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:14:53.833132', 'step': 3261, 'epoch': 3} {'type': 'loss', 'content': 0.0010594201739877462, 'timestamp': '2025-09-04 04:14:53.849926', 'step': 3262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:53.949469', 'step': 3262, 'epoch': 3} {'type': 'loss', 'content': 0.002619499806314707, 'timestamp': '2025-09-04 04:14:53.968160', 'step': 3263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:14:54.069273', 'step': 3263, 'epoch': 3} {'type': 'loss', 'content': 0.002300729276612401, 'timestamp': '2025-09-04 04:14:54.088775', 'step': 3264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:14:54.193743', 'step': 3264, 'epoch': 3} {'type': 'loss', 'content': 0.0029513502959161997, 'timestamp': '2025-09-04 04:14:54.215967', 'step': 3265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:54.311347', 'step': 3265, 'epoch': 3} {'type': 'loss', 'content': 0.0033043273724615574, 'timestamp': '2025-09-04 04:14:54.328764', 'step': 3266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:14:54.423680', 'step': 3266, 'epoch': 3} {'type': 'loss', 'content': 0.0076232897117733955, 'timestamp': '2025-09-04 04:14:54.441064', 'step': 3267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:14:54.549806', 'step': 3267, 'epoch': 3} {'type': 'loss', 'content': 0.012323771603405476, 'timestamp': '2025-09-04 04:14:54.570969', 'step': 3268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:54.677234', 'step': 3268, 'epoch': 3} {'type': 'loss', 'content': 0.014858097769320011, 'timestamp': '2025-09-04 04:14:54.699205', 'step': 3269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:14:54.771795', 'step': 3269, 'epoch': 3} {'type': 'loss', 'content': 0.006694823037832975, 'timestamp': '2025-09-04 04:14:54.784684', 'step': 3270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:14:54.870959', 'step': 3270, 'epoch': 3} {'type': 'loss', 'content': 0.0045067863538861275, 'timestamp': '2025-09-04 04:14:54.886529', 'step': 3271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:54.994351', 'step': 3271, 'epoch': 3} {'type': 'loss', 'content': 0.0029527172446250916, 'timestamp': '2025-09-04 04:14:55.015092', 'step': 3272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:14:55.106218', 'step': 3272, 'epoch': 3} {'type': 'loss', 'content': 0.010578243993222713, 'timestamp': '2025-09-04 04:14:55.124807', 'step': 3273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:14:55.217610', 'step': 3273, 'epoch': 3} {'type': 'loss', 'content': 0.004275831393897533, 'timestamp': '2025-09-04 04:14:55.234893', 'step': 3274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:14:55.338811', 'step': 3274, 'epoch': 3} {'type': 'loss', 'content': 0.004477000795304775, 'timestamp': '2025-09-04 04:14:55.358098', 'step': 3275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:14:55.454527', 'step': 3275, 'epoch': 3} {'type': 'loss', 'content': 0.003879428841173649, 'timestamp': '2025-09-04 04:14:55.472942', 'step': 3276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:14:55.577939', 'step': 3276, 'epoch': 3} {'type': 'loss', 'content': 0.06313685327768326, 'timestamp': '2025-09-04 04:14:55.599965', 'step': 3277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 04:14:55.763310', 'step': 3277, 'epoch': 3} {'type': 'loss', 'content': 0.002089696703478694, 'timestamp': '2025-09-04 04:14:55.795506', 'step': 3278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:14:55.880590', 'step': 3278, 'epoch': 3} {'type': 'loss', 'content': 0.0007018198375590146, 'timestamp': '2025-09-04 04:14:55.895788', 'step': 3279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:14:55.998173', 'step': 3279, 'epoch': 3} {'type': 'loss', 'content': 0.02527954801917076, 'timestamp': '2025-09-04 04:14:56.018237', 'step': 3280, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:04.467066', 'step': 3280, 'epoch': 3} {'type': 'pplx', 'content': 306.23200739000436, 'timestamp': '2025-09-04 04:15:04.469448', 'step': 3280, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3280', 'timestamp': '2025-09-04 04:15:04.982903', 'step': 3280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:15:05.079346', 'step': 3280, 'epoch': 3} {'type': 'loss', 'content': 0.03897909075021744, 'timestamp': '2025-09-04 04:15:05.099648', 'step': 3281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:05.202261', 'step': 3281, 'epoch': 3} {'type': 'loss', 'content': 0.0016802679747343063, 'timestamp': '2025-09-04 04:15:05.221465', 'step': 3282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:05.329841', 'step': 3282, 'epoch': 3} {'type': 'loss', 'content': 0.002108318265527487, 'timestamp': '2025-09-04 04:15:05.349197', 'step': 3283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:05.452573', 'step': 3283, 'epoch': 3} {'type': 'loss', 'content': 0.00026367095415480435, 'timestamp': '2025-09-04 04:15:05.472578', 'step': 3284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:15:05.563126', 'step': 3284, 'epoch': 3} {'type': 'loss', 'content': 0.0026424103416502476, 'timestamp': '2025-09-04 04:15:05.581936', 'step': 3285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:15:05.699833', 'step': 3285, 'epoch': 3} {'type': 'loss', 'content': 0.00553141999989748, 'timestamp': '2025-09-04 04:15:05.721891', 'step': 3286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:05.814069', 'step': 3286, 'epoch': 3} {'type': 'loss', 'content': 0.009483684785664082, 'timestamp': '2025-09-04 04:15:05.830807', 'step': 3287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:15:05.926426', 'step': 3287, 'epoch': 3} {'type': 'loss', 'content': 0.0044782888144254684, 'timestamp': '2025-09-04 04:15:05.944699', 'step': 3288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:15:06.061273', 'step': 3288, 'epoch': 3} {'type': 'loss', 'content': 0.0009773524943739176, 'timestamp': '2025-09-04 04:15:06.085178', 'step': 3289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:06.192109', 'step': 3289, 'epoch': 3} {'type': 'loss', 'content': 0.00231315684504807, 'timestamp': '2025-09-04 04:15:06.212110', 'step': 3290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:06.305889', 'step': 3290, 'epoch': 3} {'type': 'loss', 'content': 0.0034602778032422066, 'timestamp': '2025-09-04 04:15:06.320028', 'step': 3291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:15:06.402443', 'step': 3291, 'epoch': 3} {'type': 'loss', 'content': 0.005539227742701769, 'timestamp': '2025-09-04 04:15:06.418345', 'step': 3292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:15:06.498804', 'step': 3292, 'epoch': 3} {'type': 'loss', 'content': 0.0033372659236192703, 'timestamp': '2025-09-04 04:15:06.515338', 'step': 3293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:06.635892', 'step': 3293, 'epoch': 3} {'type': 'loss', 'content': 0.0007293337839655578, 'timestamp': '2025-09-04 04:15:06.655876', 'step': 3294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:15:06.791530', 'step': 3294, 'epoch': 3} {'type': 'loss', 'content': 0.0006761676049791276, 'timestamp': '2025-09-04 04:15:06.817475', 'step': 3295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:15:06.904333', 'step': 3295, 'epoch': 3} {'type': 'loss', 'content': 0.032214339822530746, 'timestamp': '2025-09-04 04:15:06.920789', 'step': 3296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:15:06.994544', 'step': 3296, 'epoch': 3} {'type': 'loss', 'content': 0.004622759763151407, 'timestamp': '2025-09-04 04:15:07.009354', 'step': 3297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 448], 'flops': 8960054460160.0}, 'timestamp': '2025-09-04 04:15:07.081752', 'step': 3297, 'epoch': 3} {'type': 'loss', 'content': 0.00865192525088787, 'timestamp': '2025-09-04 04:15:07.094680', 'step': 3298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:15:07.190603', 'step': 3298, 'epoch': 3} {'type': 'loss', 'content': 0.004962395876646042, 'timestamp': '2025-09-04 04:15:07.208103', 'step': 3299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:15:07.306707', 'step': 3299, 'epoch': 3} {'type': 'loss', 'content': 0.004638133570551872, 'timestamp': '2025-09-04 04:15:07.324605', 'step': 3300, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:15.708785', 'step': 3300, 'epoch': 3} {'type': 'pplx', 'content': 310.93065235576375, 'timestamp': '2025-09-04 04:15:15.710762', 'step': 3300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:15.809849', 'step': 3300, 'epoch': 3} {'type': 'loss', 'content': 0.00043890104279853404, 'timestamp': '2025-09-04 04:15:15.831027', 'step': 3301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:15.936809', 'step': 3301, 'epoch': 3} {'type': 'loss', 'content': 0.031615160405635834, 'timestamp': '2025-09-04 04:15:15.956837', 'step': 3302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:16.034876', 'step': 3302, 'epoch': 3} {'type': 'loss', 'content': 0.012209202162921429, 'timestamp': '2025-09-04 04:15:16.048981', 'step': 3303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:16.139795', 'step': 3303, 'epoch': 3} {'type': 'loss', 'content': 0.01653936877846718, 'timestamp': '2025-09-04 04:15:16.157321', 'step': 3304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:16.255858', 'step': 3304, 'epoch': 3} {'type': 'loss', 'content': 0.016090011224150658, 'timestamp': '2025-09-04 04:15:16.276637', 'step': 3305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:16.379531', 'step': 3305, 'epoch': 3} {'type': 'loss', 'content': 0.0072241052985191345, 'timestamp': '2025-09-04 04:15:16.398754', 'step': 3306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:16.499129', 'step': 3306, 'epoch': 3} {'type': 'loss', 'content': 0.0002566951443441212, 'timestamp': '2025-09-04 04:15:16.517889', 'step': 3307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:16.596145', 'step': 3307, 'epoch': 3} {'type': 'loss', 'content': 0.0032684989273548126, 'timestamp': '2025-09-04 04:15:16.610992', 'step': 3308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:15:16.685266', 'step': 3308, 'epoch': 3} {'type': 'loss', 'content': 0.0016026400262489915, 'timestamp': '2025-09-04 04:15:16.700263', 'step': 3309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:16.807189', 'step': 3309, 'epoch': 3} {'type': 'loss', 'content': 0.002181933494284749, 'timestamp': '2025-09-04 04:15:16.827297', 'step': 3310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:16.937081', 'step': 3310, 'epoch': 3} {'type': 'loss', 'content': 0.002524265320971608, 'timestamp': '2025-09-04 04:15:16.957733', 'step': 3311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:17.068037', 'step': 3311, 'epoch': 3} {'type': 'loss', 'content': 0.0015753493644297123, 'timestamp': '2025-09-04 04:15:17.089325', 'step': 3312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:15:17.173509', 'step': 3312, 'epoch': 3} {'type': 'loss', 'content': 0.010916945524513721, 'timestamp': '2025-09-04 04:15:17.190442', 'step': 3313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:17.300267', 'step': 3313, 'epoch': 3} {'type': 'loss', 'content': 0.002287115901708603, 'timestamp': '2025-09-04 04:15:17.320788', 'step': 3314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:17.411659', 'step': 3314, 'epoch': 3} {'type': 'loss', 'content': 0.02453738823533058, 'timestamp': '2025-09-04 04:15:17.428428', 'step': 3315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:15:17.526468', 'step': 3315, 'epoch': 3} {'type': 'loss', 'content': 0.0024574180133640766, 'timestamp': '2025-09-04 04:15:17.545929', 'step': 3316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:17.634340', 'step': 3316, 'epoch': 3} {'type': 'loss', 'content': 0.003129773773252964, 'timestamp': '2025-09-04 04:15:17.652692', 'step': 3317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:15:17.730020', 'step': 3317, 'epoch': 3} {'type': 'loss', 'content': 0.0017890299204736948, 'timestamp': '2025-09-04 04:15:17.744156', 'step': 3318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:17.846641', 'step': 3318, 'epoch': 3} {'type': 'loss', 'content': 0.0003455560654401779, 'timestamp': '2025-09-04 04:15:17.865764', 'step': 3319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:17.955724', 'step': 3319, 'epoch': 3} {'type': 'loss', 'content': 0.028852151706814766, 'timestamp': '2025-09-04 04:15:17.973313', 'step': 3320, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:26.366823', 'step': 3320, 'epoch': 3} {'type': 'pplx', 'content': 315.5775818717673, 'timestamp': '2025-09-04 04:15:26.368541', 'step': 3320, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3320', 'timestamp': '2025-09-04 04:15:26.717810', 'step': 3320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:15:26.835668', 'step': 3320, 'epoch': 3} {'type': 'loss', 'content': 0.0002499267866369337, 'timestamp': '2025-09-04 04:15:26.860977', 'step': 3321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:15:26.945054', 'step': 3321, 'epoch': 3} {'type': 'loss', 'content': 0.0008101621060632169, 'timestamp': '2025-09-04 04:15:26.960654', 'step': 3322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:15:27.054780', 'step': 3322, 'epoch': 3} {'type': 'loss', 'content': 0.012700149789452553, 'timestamp': '2025-09-04 04:15:27.072188', 'step': 3323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:15:27.172098', 'step': 3323, 'epoch': 3} {'type': 'loss', 'content': 0.007206479553133249, 'timestamp': '2025-09-04 04:15:27.191492', 'step': 3324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:27.268978', 'step': 3324, 'epoch': 3} {'type': 'loss', 'content': 0.013278383761644363, 'timestamp': '2025-09-04 04:15:27.284463', 'step': 3325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:27.395471', 'step': 3325, 'epoch': 3} {'type': 'loss', 'content': 0.004693718161433935, 'timestamp': '2025-09-04 04:15:27.414655', 'step': 3326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:15:27.490883', 'step': 3326, 'epoch': 3} {'type': 'loss', 'content': 0.029808765277266502, 'timestamp': '2025-09-04 04:15:27.504664', 'step': 3327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:27.608290', 'step': 3327, 'epoch': 3} {'type': 'loss', 'content': 0.00016743317246437073, 'timestamp': '2025-09-04 04:15:27.628357', 'step': 3328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:15:27.703701', 'step': 3328, 'epoch': 3} {'type': 'loss', 'content': 0.018314022570848465, 'timestamp': '2025-09-04 04:15:27.718997', 'step': 3329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:27.828684', 'step': 3329, 'epoch': 3} {'type': 'loss', 'content': 0.0024994483683258295, 'timestamp': '2025-09-04 04:15:27.849182', 'step': 3330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:15:27.935748', 'step': 3330, 'epoch': 3} {'type': 'loss', 'content': 0.00011107311001978815, 'timestamp': '2025-09-04 04:15:27.951411', 'step': 3331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 848], 'flops': 16960103024960.0}, 'timestamp': '2025-09-04 04:15:28.077014', 'step': 3331, 'epoch': 3} {'type': 'loss', 'content': 0.010431556962430477, 'timestamp': '2025-09-04 04:15:28.101871', 'step': 3332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:15:28.193747', 'step': 3332, 'epoch': 3} {'type': 'loss', 'content': 0.03157045319676399, 'timestamp': '2025-09-04 04:15:28.212571', 'step': 3333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:15:28.296301', 'step': 3333, 'epoch': 3} {'type': 'loss', 'content': 0.04083694517612457, 'timestamp': '2025-09-04 04:15:28.311522', 'step': 3334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:15:28.405898', 'step': 3334, 'epoch': 3} {'type': 'loss', 'content': 0.0034233976621180773, 'timestamp': '2025-09-04 04:15:28.423349', 'step': 3335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:28.532837', 'step': 3335, 'epoch': 3} {'type': 'loss', 'content': 0.021260175853967667, 'timestamp': '2025-09-04 04:15:28.554152', 'step': 3336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:28.651272', 'step': 3336, 'epoch': 3} {'type': 'loss', 'content': 0.015738222748041153, 'timestamp': '2025-09-04 04:15:28.671979', 'step': 3337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:28.775677', 'step': 3337, 'epoch': 3} {'type': 'loss', 'content': 0.007692721672356129, 'timestamp': '2025-09-04 04:15:28.794901', 'step': 3338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:28.896271', 'step': 3338, 'epoch': 3} {'type': 'loss', 'content': 0.0014377308543771505, 'timestamp': '2025-09-04 04:15:28.915136', 'step': 3339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:28.993680', 'step': 3339, 'epoch': 3} {'type': 'loss', 'content': 0.016688672825694084, 'timestamp': '2025-09-04 04:15:29.008612', 'step': 3340, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:37.370468', 'step': 3340, 'epoch': 3} {'type': 'pplx', 'content': 312.33404172021034, 'timestamp': '2025-09-04 04:15:37.372516', 'step': 3340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:37.473968', 'step': 3340, 'epoch': 3} {'type': 'loss', 'content': 0.011500056833028793, 'timestamp': '2025-09-04 04:15:37.495856', 'step': 3341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:37.606004', 'step': 3341, 'epoch': 3} {'type': 'loss', 'content': 0.0288741085678339, 'timestamp': '2025-09-04 04:15:37.626404', 'step': 3342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:37.728252', 'step': 3342, 'epoch': 3} {'type': 'loss', 'content': 0.004480296280235052, 'timestamp': '2025-09-04 04:15:37.747168', 'step': 3343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:15:37.842312', 'step': 3343, 'epoch': 3} {'type': 'loss', 'content': 0.002481586765497923, 'timestamp': '2025-09-04 04:15:37.860519', 'step': 3344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:37.949491', 'step': 3344, 'epoch': 3} {'type': 'loss', 'content': 0.013428304344415665, 'timestamp': '2025-09-04 04:15:37.967921', 'step': 3345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:15:38.045038', 'step': 3345, 'epoch': 3} {'type': 'loss', 'content': 0.0397348590195179, 'timestamp': '2025-09-04 04:15:38.058674', 'step': 3346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:15:38.166347', 'step': 3346, 'epoch': 3} {'type': 'loss', 'content': 0.0024191653355956078, 'timestamp': '2025-09-04 04:15:38.186670', 'step': 3347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:15:38.302749', 'step': 3347, 'epoch': 3} {'type': 'loss', 'content': 0.0025882297195494175, 'timestamp': '2025-09-04 04:15:38.325668', 'step': 3348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:15:38.438407', 'step': 3348, 'epoch': 3} {'type': 'loss', 'content': 0.002427577506750822, 'timestamp': '2025-09-04 04:15:38.461108', 'step': 3349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:15:38.539248', 'step': 3349, 'epoch': 3} {'type': 'loss', 'content': 0.003565673716366291, 'timestamp': '2025-09-04 04:15:38.553261', 'step': 3350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:38.643791', 'step': 3350, 'epoch': 3} {'type': 'loss', 'content': 0.0014742841012775898, 'timestamp': '2025-09-04 04:15:38.660494', 'step': 3351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:15:38.770850', 'step': 3351, 'epoch': 3} {'type': 'loss', 'content': 0.0006036367267370224, 'timestamp': '2025-09-04 04:15:38.792263', 'step': 3352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:38.889981', 'step': 3352, 'epoch': 3} {'type': 'loss', 'content': 0.023486964404582977, 'timestamp': '2025-09-04 04:15:38.910660', 'step': 3353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:15:39.018492', 'step': 3353, 'epoch': 3} {'type': 'loss', 'content': 0.008537087589502335, 'timestamp': '2025-09-04 04:15:39.038759', 'step': 3354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:15:39.122884', 'step': 3354, 'epoch': 3} {'type': 'loss', 'content': 0.0004986139247193933, 'timestamp': '2025-09-04 04:15:39.138020', 'step': 3355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:39.245800', 'step': 3355, 'epoch': 3} {'type': 'loss', 'content': 0.03504369035363197, 'timestamp': '2025-09-04 04:15:39.266945', 'step': 3356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:15:39.343131', 'step': 3356, 'epoch': 3} {'type': 'loss', 'content': 0.017468314617872238, 'timestamp': '2025-09-04 04:15:39.358533', 'step': 3357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:39.461610', 'step': 3357, 'epoch': 3} {'type': 'loss', 'content': 0.005034047178924084, 'timestamp': '2025-09-04 04:15:39.480862', 'step': 3358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:15:39.566712', 'step': 3358, 'epoch': 3} {'type': 'loss', 'content': 0.031177129596471786, 'timestamp': '2025-09-04 04:15:39.582311', 'step': 3359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:15:39.684803', 'step': 3359, 'epoch': 3} {'type': 'loss', 'content': 0.0004434007278177887, 'timestamp': '2025-09-04 04:15:39.704763', 'step': 3360, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:48.145682', 'step': 3360, 'epoch': 3} {'type': 'pplx', 'content': 301.0731658055851, 'timestamp': '2025-09-04 04:15:48.147942', 'step': 3360, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3360', 'timestamp': '2025-09-04 04:15:48.687287', 'step': 3360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:15:48.767367', 'step': 3360, 'epoch': 3} {'type': 'loss', 'content': 0.0026787512470036745, 'timestamp': '2025-09-04 04:15:48.783668', 'step': 3361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:48.896544', 'step': 3361, 'epoch': 3} {'type': 'loss', 'content': 0.05144086107611656, 'timestamp': '2025-09-04 04:15:48.916900', 'step': 3362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:15:49.012988', 'step': 3362, 'epoch': 3} {'type': 'loss', 'content': 0.00442865677177906, 'timestamp': '2025-09-04 04:15:49.030271', 'step': 3363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:49.131246', 'step': 3363, 'epoch': 3} {'type': 'loss', 'content': 0.0024153843987733126, 'timestamp': '2025-09-04 04:15:49.150792', 'step': 3364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:15:49.247227', 'step': 3364, 'epoch': 3} {'type': 'loss', 'content': 0.00037681308458559215, 'timestamp': '2025-09-04 04:15:49.267476', 'step': 3365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:15:49.361875', 'step': 3365, 'epoch': 3} {'type': 'loss', 'content': 0.004885226022452116, 'timestamp': '2025-09-04 04:15:49.378943', 'step': 3366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:49.479578', 'step': 3366, 'epoch': 3} {'type': 'loss', 'content': 0.020293110981583595, 'timestamp': '2025-09-04 04:15:49.498362', 'step': 3367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:15:49.601224', 'step': 3367, 'epoch': 3} {'type': 'loss', 'content': 0.02302255854010582, 'timestamp': '2025-09-04 04:15:49.620988', 'step': 3368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:15:49.727729', 'step': 3368, 'epoch': 3} {'type': 'loss', 'content': 0.0001401986082782969, 'timestamp': '2025-09-04 04:15:49.750154', 'step': 3369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:15:49.859459', 'step': 3369, 'epoch': 3} {'type': 'loss', 'content': 0.03139295056462288, 'timestamp': '2025-09-04 04:15:49.879856', 'step': 3370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:49.981889', 'step': 3370, 'epoch': 3} {'type': 'loss', 'content': 0.000647324079181999, 'timestamp': '2025-09-04 04:15:50.000762', 'step': 3371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:15:50.071999', 'step': 3371, 'epoch': 3} {'type': 'loss', 'content': 0.020890971645712852, 'timestamp': '2025-09-04 04:15:50.085529', 'step': 3372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:15:50.181131', 'step': 3372, 'epoch': 3} {'type': 'loss', 'content': 0.0002276118320878595, 'timestamp': '2025-09-04 04:15:50.200295', 'step': 3373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:15:50.306852', 'step': 3373, 'epoch': 3} {'type': 'loss', 'content': 0.021090731024742126, 'timestamp': '2025-09-04 04:15:50.326971', 'step': 3374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:15:50.463072', 'step': 3374, 'epoch': 3} {'type': 'loss', 'content': 0.0053464737720787525, 'timestamp': '2025-09-04 04:15:50.489249', 'step': 3375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:15:50.600962', 'step': 3375, 'epoch': 3} {'type': 'loss', 'content': 0.00169714679941535, 'timestamp': '2025-09-04 04:15:50.622394', 'step': 3376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:15:50.705172', 'step': 3376, 'epoch': 3} {'type': 'loss', 'content': 0.0071422443725168705, 'timestamp': '2025-09-04 04:15:50.722252', 'step': 3377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:15:50.812122', 'step': 3377, 'epoch': 3} {'type': 'loss', 'content': 0.0026480015367269516, 'timestamp': '2025-09-04 04:15:50.828896', 'step': 3378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:50.928878', 'step': 3378, 'epoch': 3} {'type': 'loss', 'content': 0.0017498015658929944, 'timestamp': '2025-09-04 04:15:50.947886', 'step': 3379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:51.048104', 'step': 3379, 'epoch': 3} {'type': 'loss', 'content': 0.001030558254569769, 'timestamp': '2025-09-04 04:15:51.067707', 'step': 3380, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:15:59.446548', 'step': 3380, 'epoch': 3} {'type': 'pplx', 'content': 294.73432370067934, 'timestamp': '2025-09-04 04:15:59.448500', 'step': 3380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:15:59.530511', 'step': 3380, 'epoch': 3} {'type': 'loss', 'content': 0.00033817789517343044, 'timestamp': '2025-09-04 04:15:59.547754', 'step': 3381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:15:59.648370', 'step': 3381, 'epoch': 3} {'type': 'loss', 'content': 0.0006461319862864912, 'timestamp': '2025-09-04 04:15:59.667244', 'step': 3382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:15:59.752986', 'step': 3382, 'epoch': 3} {'type': 'loss', 'content': 0.027155397459864616, 'timestamp': '2025-09-04 04:15:59.768515', 'step': 3383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:15:59.866742', 'step': 3383, 'epoch': 3} {'type': 'loss', 'content': 0.07015629857778549, 'timestamp': '2025-09-04 04:15:59.884939', 'step': 3384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:15:59.998615', 'step': 3384, 'epoch': 3} {'type': 'loss', 'content': 0.0016538852360099554, 'timestamp': '2025-09-04 04:16:00.022906', 'step': 3385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:00.122185', 'step': 3385, 'epoch': 3} {'type': 'loss', 'content': 0.11791013926267624, 'timestamp': '2025-09-04 04:16:00.140724', 'step': 3386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:16:00.228744', 'step': 3386, 'epoch': 3} {'type': 'loss', 'content': 0.014148048125207424, 'timestamp': '2025-09-04 04:16:00.244329', 'step': 3387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:16:00.365436', 'step': 3387, 'epoch': 3} {'type': 'loss', 'content': 0.0005752384895458817, 'timestamp': '2025-09-04 04:16:00.389346', 'step': 3388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:00.490559', 'step': 3388, 'epoch': 3} {'type': 'loss', 'content': 0.0020116129890084267, 'timestamp': '2025-09-04 04:16:00.511811', 'step': 3389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:00.612654', 'step': 3389, 'epoch': 3} {'type': 'loss', 'content': 0.015090583823621273, 'timestamp': '2025-09-04 04:16:00.631434', 'step': 3390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:16:00.726083', 'step': 3390, 'epoch': 3} {'type': 'loss', 'content': 0.0049343351274728775, 'timestamp': '2025-09-04 04:16:00.743678', 'step': 3391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:00.843939', 'step': 3391, 'epoch': 3} {'type': 'loss', 'content': 0.000529682612977922, 'timestamp': '2025-09-04 04:16:00.863659', 'step': 3392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:16:00.956813', 'step': 3392, 'epoch': 3} {'type': 'loss', 'content': 0.0011004252592101693, 'timestamp': '2025-09-04 04:16:00.976139', 'step': 3393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:01.070353', 'step': 3393, 'epoch': 3} {'type': 'loss', 'content': 0.02100779488682747, 'timestamp': '2025-09-04 04:16:01.087563', 'step': 3394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:01.187918', 'step': 3394, 'epoch': 3} {'type': 'loss', 'content': 0.010034440085291862, 'timestamp': '2025-09-04 04:16:01.206884', 'step': 3395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:01.299913', 'step': 3395, 'epoch': 3} {'type': 'loss', 'content': 0.04920711740851402, 'timestamp': '2025-09-04 04:16:01.317898', 'step': 3396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:01.423832', 'step': 3396, 'epoch': 3} {'type': 'loss', 'content': 0.041153181344270706, 'timestamp': '2025-09-04 04:16:01.445643', 'step': 3397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:01.549800', 'step': 3397, 'epoch': 3} {'type': 'loss', 'content': 0.02985711395740509, 'timestamp': '2025-09-04 04:16:01.569180', 'step': 3398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:01.668606', 'step': 3398, 'epoch': 3} {'type': 'loss', 'content': 0.000561038323212415, 'timestamp': '2025-09-04 04:16:01.687288', 'step': 3399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:01.787984', 'step': 3399, 'epoch': 3} {'type': 'loss', 'content': 0.002825426869094372, 'timestamp': '2025-09-04 04:16:01.807598', 'step': 3400, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:16:10.365512', 'step': 3400, 'epoch': 3} {'type': 'pplx', 'content': 285.49062028489874, 'timestamp': '2025-09-04 04:16:10.371170', 'step': 3400, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3400', 'timestamp': '2025-09-04 04:16:10.757804', 'step': 3400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:10.846129', 'step': 3400, 'epoch': 3} {'type': 'loss', 'content': 0.002441899385303259, 'timestamp': '2025-09-04 04:16:10.864175', 'step': 3401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:16:10.962424', 'step': 3401, 'epoch': 3} {'type': 'loss', 'content': 0.0007989015430212021, 'timestamp': '2025-09-04 04:16:10.979656', 'step': 3402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:11.084061', 'step': 3402, 'epoch': 3} {'type': 'loss', 'content': 0.0034658664371818304, 'timestamp': '2025-09-04 04:16:11.103037', 'step': 3403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:16:11.242472', 'step': 3403, 'epoch': 3} {'type': 'loss', 'content': 0.011140529066324234, 'timestamp': '2025-09-04 04:16:11.269313', 'step': 3404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:11.359769', 'step': 3404, 'epoch': 3} {'type': 'loss', 'content': 0.007334363646805286, 'timestamp': '2025-09-04 04:16:11.377858', 'step': 3405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:11.470040', 'step': 3405, 'epoch': 3} {'type': 'loss', 'content': 0.007528170011937618, 'timestamp': '2025-09-04 04:16:11.486570', 'step': 3406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:11.591013', 'step': 3406, 'epoch': 3} {'type': 'loss', 'content': 0.0002277525927638635, 'timestamp': '2025-09-04 04:16:11.609976', 'step': 3407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:11.702970', 'step': 3407, 'epoch': 3} {'type': 'loss', 'content': 0.0009056737762875855, 'timestamp': '2025-09-04 04:16:11.720292', 'step': 3408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:11.822714', 'step': 3408, 'epoch': 3} {'type': 'loss', 'content': 0.007609906140714884, 'timestamp': '2025-09-04 04:16:11.843614', 'step': 3409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:11.949959', 'step': 3409, 'epoch': 3} {'type': 'loss', 'content': 0.008335032500326633, 'timestamp': '2025-09-04 04:16:11.969001', 'step': 3410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:16:12.048036', 'step': 3410, 'epoch': 3} {'type': 'loss', 'content': 0.0005043984274379909, 'timestamp': '2025-09-04 04:16:12.061607', 'step': 3411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:12.174289', 'step': 3411, 'epoch': 3} {'type': 'loss', 'content': 0.005011504516005516, 'timestamp': '2025-09-04 04:16:12.195379', 'step': 3412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:12.294306', 'step': 3412, 'epoch': 3} {'type': 'loss', 'content': 0.030210444703698158, 'timestamp': '2025-09-04 04:16:12.314552', 'step': 3413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:12.420677', 'step': 3413, 'epoch': 3} {'type': 'loss', 'content': 0.0011688501108437777, 'timestamp': '2025-09-04 04:16:12.439751', 'step': 3414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:12.554307', 'step': 3414, 'epoch': 3} {'type': 'loss', 'content': 0.014814517460763454, 'timestamp': '2025-09-04 04:16:12.573324', 'step': 3415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:12.676000', 'step': 3415, 'epoch': 3} {'type': 'loss', 'content': 0.022736500948667526, 'timestamp': '2025-09-04 04:16:12.695440', 'step': 3416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:12.797542', 'step': 3416, 'epoch': 3} {'type': 'loss', 'content': 0.0009403983131051064, 'timestamp': '2025-09-04 04:16:12.818416', 'step': 3417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:16:12.930686', 'step': 3417, 'epoch': 3} {'type': 'loss', 'content': 0.003918331582099199, 'timestamp': '2025-09-04 04:16:12.951167', 'step': 3418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:16:13.070130', 'step': 3418, 'epoch': 3} {'type': 'loss', 'content': 0.002362527186051011, 'timestamp': '2025-09-04 04:16:13.092009', 'step': 3419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:13.199697', 'step': 3419, 'epoch': 3} {'type': 'loss', 'content': 0.011338985525071621, 'timestamp': '2025-09-04 04:16:13.220218', 'step': 3420, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:16:21.725284', 'step': 3420, 'epoch': 3} {'type': 'pplx', 'content': 281.34383504857846, 'timestamp': '2025-09-04 04:16:21.727582', 'step': 3420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:21.826125', 'step': 3420, 'epoch': 3} {'type': 'loss', 'content': 0.003105961252003908, 'timestamp': '2025-09-04 04:16:21.847337', 'step': 3421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:16:21.932865', 'step': 3421, 'epoch': 3} {'type': 'loss', 'content': 0.0025257884990423918, 'timestamp': '2025-09-04 04:16:21.948610', 'step': 3422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:22.040843', 'step': 3422, 'epoch': 3} {'type': 'loss', 'content': 0.0005112270591780543, 'timestamp': '2025-09-04 04:16:22.057951', 'step': 3423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 04:16:22.241785', 'step': 3423, 'epoch': 3} {'type': 'loss', 'content': 0.012885574251413345, 'timestamp': '2025-09-04 04:16:22.277227', 'step': 3424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:22.368483', 'step': 3424, 'epoch': 3} {'type': 'loss', 'content': 0.007698435802012682, 'timestamp': '2025-09-04 04:16:22.387279', 'step': 3425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:22.496149', 'step': 3425, 'epoch': 3} {'type': 'loss', 'content': 0.0006365908775478601, 'timestamp': '2025-09-04 04:16:22.516680', 'step': 3426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:22.619441', 'step': 3426, 'epoch': 3} {'type': 'loss', 'content': 0.026056919246912003, 'timestamp': '2025-09-04 04:16:22.638651', 'step': 3427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:16:22.722696', 'step': 3427, 'epoch': 3} {'type': 'loss', 'content': 0.0844995528459549, 'timestamp': '2025-09-04 04:16:22.738555', 'step': 3428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:16:22.822651', 'step': 3428, 'epoch': 3} {'type': 'loss', 'content': 0.0028854222036898136, 'timestamp': '2025-09-04 04:16:22.839858', 'step': 3429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:22.929402', 'step': 3429, 'epoch': 3} {'type': 'loss', 'content': 0.000976667390204966, 'timestamp': '2025-09-04 04:16:22.946348', 'step': 3430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:23.044574', 'step': 3430, 'epoch': 3} {'type': 'loss', 'content': 0.019412081688642502, 'timestamp': '2025-09-04 04:16:23.063135', 'step': 3431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:23.162180', 'step': 3431, 'epoch': 3} {'type': 'loss', 'content': 0.0002543810987845063, 'timestamp': '2025-09-04 04:16:23.181583', 'step': 3432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1248], 'flops': 24960151589760.0}, 'timestamp': '2025-09-04 04:16:23.360867', 'step': 3432, 'epoch': 3} {'type': 'loss', 'content': 0.0077868239022791386, 'timestamp': '2025-09-04 04:16:23.398858', 'step': 3433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:23.498788', 'step': 3433, 'epoch': 3} {'type': 'loss', 'content': 0.0018246417166665196, 'timestamp': '2025-09-04 04:16:23.517327', 'step': 3434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:16:23.594959', 'step': 3434, 'epoch': 3} {'type': 'loss', 'content': 0.03193148225545883, 'timestamp': '2025-09-04 04:16:23.609001', 'step': 3435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:23.711499', 'step': 3435, 'epoch': 3} {'type': 'loss', 'content': 0.0072656129486858845, 'timestamp': '2025-09-04 04:16:23.731140', 'step': 3436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:23.830096', 'step': 3436, 'epoch': 3} {'type': 'loss', 'content': 0.0032099627424031496, 'timestamp': '2025-09-04 04:16:23.850782', 'step': 3437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:16:23.945449', 'step': 3437, 'epoch': 3} {'type': 'loss', 'content': 0.004939934704452753, 'timestamp': '2025-09-04 04:16:23.962953', 'step': 3438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:24.065224', 'step': 3438, 'epoch': 3} {'type': 'loss', 'content': 0.003717708634212613, 'timestamp': '2025-09-04 04:16:24.084445', 'step': 3439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:24.192910', 'step': 3439, 'epoch': 3} {'type': 'loss', 'content': 0.010961350053548813, 'timestamp': '2025-09-04 04:16:24.214147', 'step': 3440, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:16:32.619010', 'step': 3440, 'epoch': 3} {'type': 'pplx', 'content': 276.0165633644801, 'timestamp': '2025-09-04 04:16:32.621440', 'step': 3440, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3440', 'timestamp': '2025-09-04 04:16:33.156281', 'step': 3440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:33.245723', 'step': 3440, 'epoch': 3} {'type': 'loss', 'content': 0.0005490960320457816, 'timestamp': '2025-09-04 04:16:33.264393', 'step': 3441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:16:33.338981', 'step': 3441, 'epoch': 3} {'type': 'loss', 'content': 0.0026552164927124977, 'timestamp': '2025-09-04 04:16:33.352587', 'step': 3442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:16:33.460260', 'step': 3442, 'epoch': 3} {'type': 'loss', 'content': 0.0013740723952651024, 'timestamp': '2025-09-04 04:16:33.480498', 'step': 3443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:33.590052', 'step': 3443, 'epoch': 3} {'type': 'loss', 'content': 0.005307029001414776, 'timestamp': '2025-09-04 04:16:33.611410', 'step': 3444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:33.702176', 'step': 3444, 'epoch': 3} {'type': 'loss', 'content': 0.018815629184246063, 'timestamp': '2025-09-04 04:16:33.720515', 'step': 3445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:33.821133', 'step': 3445, 'epoch': 3} {'type': 'loss', 'content': 0.0011458718217909336, 'timestamp': '2025-09-04 04:16:33.839998', 'step': 3446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1408], 'flops': 28160171015680.0}, 'timestamp': '2025-09-04 04:16:34.047495', 'step': 3446, 'epoch': 3} {'type': 'loss', 'content': 0.0004305221955291927, 'timestamp': '2025-09-04 04:16:34.086809', 'step': 3447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:34.188269', 'step': 3447, 'epoch': 3} {'type': 'loss', 'content': 0.004843763541430235, 'timestamp': '2025-09-04 04:16:34.208262', 'step': 3448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:16:34.314432', 'step': 3448, 'epoch': 3} {'type': 'loss', 'content': 0.005847205873578787, 'timestamp': '2025-09-04 04:16:34.337045', 'step': 3449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:16:34.419987', 'step': 3449, 'epoch': 3} {'type': 'loss', 'content': 0.025259602814912796, 'timestamp': '2025-09-04 04:16:34.435272', 'step': 3450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:16:34.518599', 'step': 3450, 'epoch': 3} {'type': 'loss', 'content': 0.003980662208050489, 'timestamp': '2025-09-04 04:16:34.533632', 'step': 3451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:16:34.627948', 'step': 3451, 'epoch': 3} {'type': 'loss', 'content': 0.0030712997540831566, 'timestamp': '2025-09-04 04:16:34.646239', 'step': 3452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:34.748110', 'step': 3452, 'epoch': 3} {'type': 'loss', 'content': 0.049204930663108826, 'timestamp': '2025-09-04 04:16:34.769245', 'step': 3453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:34.860524', 'step': 3453, 'epoch': 3} {'type': 'loss', 'content': 0.00023195317771751434, 'timestamp': '2025-09-04 04:16:34.877272', 'step': 3454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:34.976625', 'step': 3454, 'epoch': 3} {'type': 'loss', 'content': 0.005655570421367884, 'timestamp': '2025-09-04 04:16:34.995186', 'step': 3455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:16:35.072036', 'step': 3455, 'epoch': 3} {'type': 'loss', 'content': 0.009977075271308422, 'timestamp': '2025-09-04 04:16:35.086890', 'step': 3456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:35.177118', 'step': 3456, 'epoch': 3} {'type': 'loss', 'content': 0.017296381294727325, 'timestamp': '2025-09-04 04:16:35.195820', 'step': 3457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:35.286844', 'step': 3457, 'epoch': 3} {'type': 'loss', 'content': 0.008046641014516354, 'timestamp': '2025-09-04 04:16:35.303590', 'step': 3458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:35.412460', 'step': 3458, 'epoch': 3} {'type': 'loss', 'content': 0.03136194869875908, 'timestamp': '2025-09-04 04:16:35.432984', 'step': 3459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:35.536842', 'step': 3459, 'epoch': 3} {'type': 'loss', 'content': 0.00904142763465643, 'timestamp': '2025-09-04 04:16:35.556939', 'step': 3460, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:16:43.938279', 'step': 3460, 'epoch': 3} {'type': 'pplx', 'content': 273.344454429984, 'timestamp': '2025-09-04 04:16:43.940762', 'step': 3460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:44.042144', 'step': 3460, 'epoch': 3} {'type': 'loss', 'content': 0.006052535958588123, 'timestamp': '2025-09-04 04:16:44.064039', 'step': 3461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:44.167096', 'step': 3461, 'epoch': 3} {'type': 'loss', 'content': 0.0018702381057664752, 'timestamp': '2025-09-04 04:16:44.186364', 'step': 3462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:44.289805', 'step': 3462, 'epoch': 3} {'type': 'loss', 'content': 0.006505020894110203, 'timestamp': '2025-09-04 04:16:44.309039', 'step': 3463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:44.414600', 'step': 3463, 'epoch': 3} {'type': 'loss', 'content': 0.006749553140252829, 'timestamp': '2025-09-04 04:16:44.435284', 'step': 3464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:44.527480', 'step': 3464, 'epoch': 3} {'type': 'loss', 'content': 0.011513489298522472, 'timestamp': '2025-09-04 04:16:44.546415', 'step': 3465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:44.649578', 'step': 3465, 'epoch': 3} {'type': 'loss', 'content': 0.0009676741319708526, 'timestamp': '2025-09-04 04:16:44.668399', 'step': 3466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 04:16:44.799523', 'step': 3466, 'epoch': 3} {'type': 'loss', 'content': 0.0007584612467326224, 'timestamp': '2025-09-04 04:16:44.824173', 'step': 3467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:44.925235', 'step': 3467, 'epoch': 3} {'type': 'loss', 'content': 0.051189690828323364, 'timestamp': '2025-09-04 04:16:44.944884', 'step': 3468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:45.043356', 'step': 3468, 'epoch': 3} {'type': 'loss', 'content': 0.004076420795172453, 'timestamp': '2025-09-04 04:16:45.063783', 'step': 3469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:45.169858', 'step': 3469, 'epoch': 3} {'type': 'loss', 'content': 0.01913578435778618, 'timestamp': '2025-09-04 04:16:45.189959', 'step': 3470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:16:45.283963', 'step': 3470, 'epoch': 3} {'type': 'loss', 'content': 0.0021679247729480267, 'timestamp': '2025-09-04 04:16:45.301246', 'step': 3471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:16:45.397107', 'step': 3471, 'epoch': 3} {'type': 'loss', 'content': 0.00030914912349544466, 'timestamp': '2025-09-04 04:16:45.413267', 'step': 3472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:16:45.495736', 'step': 3472, 'epoch': 3} {'type': 'loss', 'content': 0.0001749959192238748, 'timestamp': '2025-09-04 04:16:45.512249', 'step': 3473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:45.616646', 'step': 3473, 'epoch': 3} {'type': 'loss', 'content': 0.008923078887164593, 'timestamp': '2025-09-04 04:16:45.635799', 'step': 3474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:45.742204', 'step': 3474, 'epoch': 3} {'type': 'loss', 'content': 0.01262232568114996, 'timestamp': '2025-09-04 04:16:45.762090', 'step': 3475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:45.864266', 'step': 3475, 'epoch': 3} {'type': 'loss', 'content': 0.005357048008590937, 'timestamp': '2025-09-04 04:16:45.883695', 'step': 3476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:45.990063', 'step': 3476, 'epoch': 3} {'type': 'loss', 'content': 0.0005004971753805876, 'timestamp': '2025-09-04 04:16:46.012610', 'step': 3477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:46.112508', 'step': 3477, 'epoch': 3} {'type': 'loss', 'content': 0.004670882131904364, 'timestamp': '2025-09-04 04:16:46.131186', 'step': 3478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:46.240850', 'step': 3478, 'epoch': 3} {'type': 'loss', 'content': 0.001415494829416275, 'timestamp': '2025-09-04 04:16:46.261391', 'step': 3479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:46.354980', 'step': 3479, 'epoch': 3} {'type': 'loss', 'content': 0.0019339878344908357, 'timestamp': '2025-09-04 04:16:46.372907', 'step': 3480, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:16:54.758650', 'step': 3480, 'epoch': 3} {'type': 'pplx', 'content': 273.04229426249503, 'timestamp': '2025-09-04 04:16:54.760818', 'step': 3480, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3480', 'timestamp': '2025-09-04 04:16:55.106808', 'step': 3480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:55.197784', 'step': 3480, 'epoch': 3} {'type': 'loss', 'content': 0.007648364640772343, 'timestamp': '2025-09-04 04:16:55.216546', 'step': 3481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:16:55.311636', 'step': 3481, 'epoch': 3} {'type': 'loss', 'content': 0.003384833922609687, 'timestamp': '2025-09-04 04:16:55.329096', 'step': 3482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:55.421525', 'step': 3482, 'epoch': 3} {'type': 'loss', 'content': 0.0017845932161435485, 'timestamp': '2025-09-04 04:16:55.438702', 'step': 3483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:55.542236', 'step': 3483, 'epoch': 3} {'type': 'loss', 'content': 0.030296683311462402, 'timestamp': '2025-09-04 04:16:55.562301', 'step': 3484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:55.659953', 'step': 3484, 'epoch': 3} {'type': 'loss', 'content': 0.010239890776574612, 'timestamp': '2025-09-04 04:16:55.680631', 'step': 3485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:16:55.773577', 'step': 3485, 'epoch': 3} {'type': 'loss', 'content': 0.01184002310037613, 'timestamp': '2025-09-04 04:16:55.790695', 'step': 3486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:55.896552', 'step': 3486, 'epoch': 3} {'type': 'loss', 'content': 0.0026469461154192686, 'timestamp': '2025-09-04 04:16:55.916558', 'step': 3487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:56.020152', 'step': 3487, 'epoch': 3} {'type': 'loss', 'content': 0.006876929197460413, 'timestamp': '2025-09-04 04:16:56.040234', 'step': 3488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:16:56.147712', 'step': 3488, 'epoch': 3} {'type': 'loss', 'content': 0.012749651446938515, 'timestamp': '2025-09-04 04:16:56.170254', 'step': 3489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:56.273664', 'step': 3489, 'epoch': 3} {'type': 'loss', 'content': 0.00782372523099184, 'timestamp': '2025-09-04 04:16:56.292933', 'step': 3490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:16:56.378232', 'step': 3490, 'epoch': 3} {'type': 'loss', 'content': 0.0010486006503924727, 'timestamp': '2025-09-04 04:16:56.393419', 'step': 3491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:56.493707', 'step': 3491, 'epoch': 3} {'type': 'loss', 'content': 0.0012736011995002627, 'timestamp': '2025-09-04 04:16:56.513374', 'step': 3492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:16:56.611057', 'step': 3492, 'epoch': 3} {'type': 'loss', 'content': 0.019962089136242867, 'timestamp': '2025-09-04 04:16:56.631424', 'step': 3493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:16:56.734929', 'step': 3493, 'epoch': 3} {'type': 'loss', 'content': 0.016266385093331337, 'timestamp': '2025-09-04 04:16:56.754022', 'step': 3494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:56.856315', 'step': 3494, 'epoch': 3} {'type': 'loss', 'content': 0.004476721398532391, 'timestamp': '2025-09-04 04:16:56.875280', 'step': 3495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:16:56.975567', 'step': 3495, 'epoch': 3} {'type': 'loss', 'content': 0.0004053797747474164, 'timestamp': '2025-09-04 04:16:56.995298', 'step': 3496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:57.095271', 'step': 3496, 'epoch': 3} {'type': 'loss', 'content': 0.016851192340254784, 'timestamp': '2025-09-04 04:16:57.116407', 'step': 3497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:16:57.223991', 'step': 3497, 'epoch': 3} {'type': 'loss', 'content': 0.002290160395205021, 'timestamp': '2025-09-04 04:16:57.244053', 'step': 3498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:16:57.334762', 'step': 3498, 'epoch': 3} {'type': 'loss', 'content': 0.009426870383322239, 'timestamp': '2025-09-04 04:16:57.351647', 'step': 3499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:16:57.454684', 'step': 3499, 'epoch': 3} {'type': 'loss', 'content': 0.010544263757765293, 'timestamp': '2025-09-04 04:16:57.474898', 'step': 3500, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:17:05.851374', 'step': 3500, 'epoch': 3} {'type': 'pplx', 'content': 278.8655476522292, 'timestamp': '2025-09-04 04:17:05.854405', 'step': 3500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:05.952478', 'step': 3500, 'epoch': 3} {'type': 'loss', 'content': 0.006872169207781553, 'timestamp': '2025-09-04 04:17:05.973642', 'step': 3501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:06.067259', 'step': 3501, 'epoch': 3} {'type': 'loss', 'content': 0.010031554847955704, 'timestamp': '2025-09-04 04:17:06.084553', 'step': 3502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:06.174657', 'step': 3502, 'epoch': 3} {'type': 'loss', 'content': 0.005660749971866608, 'timestamp': '2025-09-04 04:17:06.191462', 'step': 3503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:17:06.276212', 'step': 3503, 'epoch': 3} {'type': 'loss', 'content': 0.009915877133607864, 'timestamp': '2025-09-04 04:17:06.292393', 'step': 3504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:06.383845', 'step': 3504, 'epoch': 3} {'type': 'loss', 'content': 0.004943343810737133, 'timestamp': '2025-09-04 04:17:06.403017', 'step': 3505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:06.504669', 'step': 3505, 'epoch': 3} {'type': 'loss', 'content': 0.0015765562420710921, 'timestamp': '2025-09-04 04:17:06.523796', 'step': 3506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:06.616657', 'step': 3506, 'epoch': 3} {'type': 'loss', 'content': 0.0038354985881596804, 'timestamp': '2025-09-04 04:17:06.633845', 'step': 3507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:06.750740', 'step': 3507, 'epoch': 3} {'type': 'loss', 'content': 0.0014609359204769135, 'timestamp': '2025-09-04 04:17:06.770698', 'step': 3508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:06.873650', 'step': 3508, 'epoch': 3} {'type': 'loss', 'content': 0.002754951361566782, 'timestamp': '2025-09-04 04:17:06.895603', 'step': 3509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:06.985320', 'step': 3509, 'epoch': 3} {'type': 'loss', 'content': 0.05803408473730087, 'timestamp': '2025-09-04 04:17:07.002134', 'step': 3510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:07.104670', 'step': 3510, 'epoch': 3} {'type': 'loss', 'content': 0.002227720571681857, 'timestamp': '2025-09-04 04:17:07.123858', 'step': 3511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:07.224100', 'step': 3511, 'epoch': 3} {'type': 'loss', 'content': 0.00011479722161311656, 'timestamp': '2025-09-04 04:17:07.243742', 'step': 3512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:07.334422', 'step': 3512, 'epoch': 3} {'type': 'loss', 'content': 0.0112196309491992, 'timestamp': '2025-09-04 04:17:07.353494', 'step': 3513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:07.447008', 'step': 3513, 'epoch': 3} {'type': 'loss', 'content': 0.00041249426431022584, 'timestamp': '2025-09-04 04:17:07.464442', 'step': 3514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:07.567901', 'step': 3514, 'epoch': 3} {'type': 'loss', 'content': 0.003908245358616114, 'timestamp': '2025-09-04 04:17:07.587156', 'step': 3515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:07.678752', 'step': 3515, 'epoch': 3} {'type': 'loss', 'content': 0.04177214950323105, 'timestamp': '2025-09-04 04:17:07.696276', 'step': 3516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:07.795192', 'step': 3516, 'epoch': 3} {'type': 'loss', 'content': 0.060547590255737305, 'timestamp': '2025-09-04 04:17:07.815910', 'step': 3517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:07.917711', 'step': 3517, 'epoch': 3} {'type': 'loss', 'content': 0.0007211702759377658, 'timestamp': '2025-09-04 04:17:07.936836', 'step': 3518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:17:08.071144', 'step': 3518, 'epoch': 3} {'type': 'loss', 'content': 0.0037805659230798483, 'timestamp': '2025-09-04 04:17:08.097057', 'step': 3519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:08.190982', 'step': 3519, 'epoch': 3} {'type': 'loss', 'content': 0.007106819190084934, 'timestamp': '2025-09-04 04:17:08.209152', 'step': 3520, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:17:16.652692', 'step': 3520, 'epoch': 3} {'type': 'pplx', 'content': 285.4826197362545, 'timestamp': '2025-09-04 04:17:16.656011', 'step': 3520, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3520', 'timestamp': '2025-09-04 04:17:17.176950', 'step': 3520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:17.280941', 'step': 3520, 'epoch': 3} {'type': 'loss', 'content': 0.00228358106687665, 'timestamp': '2025-09-04 04:17:17.302185', 'step': 3521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:17:17.387962', 'step': 3521, 'epoch': 3} {'type': 'loss', 'content': 0.005259730387479067, 'timestamp': '2025-09-04 04:17:17.401639', 'step': 3522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:17.494562', 'step': 3522, 'epoch': 3} {'type': 'loss', 'content': 0.0028661582618951797, 'timestamp': '2025-09-04 04:17:17.511807', 'step': 3523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:17:17.646766', 'step': 3523, 'epoch': 3} {'type': 'loss', 'content': 0.0053170472383499146, 'timestamp': '2025-09-04 04:17:17.673610', 'step': 3524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:17.776209', 'step': 3524, 'epoch': 3} {'type': 'loss', 'content': 0.010155066847801208, 'timestamp': '2025-09-04 04:17:17.797350', 'step': 3525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:17:17.893899', 'step': 3525, 'epoch': 3} {'type': 'loss', 'content': 0.011916323564946651, 'timestamp': '2025-09-04 04:17:17.911517', 'step': 3526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:18.002884', 'step': 3526, 'epoch': 3} {'type': 'loss', 'content': 0.0006236597546376288, 'timestamp': '2025-09-04 04:17:18.019750', 'step': 3527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:17:18.095835', 'step': 3527, 'epoch': 3} {'type': 'loss', 'content': 0.007811339106410742, 'timestamp': '2025-09-04 04:17:18.110506', 'step': 3528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:17:18.207549', 'step': 3528, 'epoch': 3} {'type': 'loss', 'content': 0.0019810060039162636, 'timestamp': '2025-09-04 04:17:18.227876', 'step': 3529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:17:18.339268', 'step': 3529, 'epoch': 3} {'type': 'loss', 'content': 0.0015669839922338724, 'timestamp': '2025-09-04 04:17:18.359806', 'step': 3530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:18.457156', 'step': 3530, 'epoch': 3} {'type': 'loss', 'content': 0.022024275735020638, 'timestamp': '2025-09-04 04:17:18.474719', 'step': 3531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:17:18.606447', 'step': 3531, 'epoch': 3} {'type': 'loss', 'content': 0.00185906991828233, 'timestamp': '2025-09-04 04:17:18.630444', 'step': 3532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:18.730940', 'step': 3532, 'epoch': 3} {'type': 'loss', 'content': 0.0033666298259049654, 'timestamp': '2025-09-04 04:17:18.752102', 'step': 3533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 04:17:18.926745', 'step': 3533, 'epoch': 3} {'type': 'loss', 'content': 0.0011104767909273505, 'timestamp': '2025-09-04 04:17:18.959381', 'step': 3534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:19.067563', 'step': 3534, 'epoch': 3} {'type': 'loss', 'content': 0.0017678681761026382, 'timestamp': '2025-09-04 04:17:19.087547', 'step': 3535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:19.195375', 'step': 3535, 'epoch': 3} {'type': 'loss', 'content': 0.003609925974160433, 'timestamp': '2025-09-04 04:17:19.216154', 'step': 3536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:19.304973', 'step': 3536, 'epoch': 3} {'type': 'loss', 'content': 0.021984629333019257, 'timestamp': '2025-09-04 04:17:19.323469', 'step': 3537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:19.429598', 'step': 3537, 'epoch': 3} {'type': 'loss', 'content': 0.00020458322251215577, 'timestamp': '2025-09-04 04:17:19.449567', 'step': 3538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:17:19.535223', 'step': 3538, 'epoch': 3} {'type': 'loss', 'content': 0.01300242729485035, 'timestamp': '2025-09-04 04:17:19.550684', 'step': 3539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:19.667433', 'step': 3539, 'epoch': 3} {'type': 'loss', 'content': 0.01605132967233658, 'timestamp': '2025-09-04 04:17:19.688333', 'step': 3540, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:17:28.163839', 'step': 3540, 'epoch': 3} {'type': 'pplx', 'content': 287.8090877611623, 'timestamp': '2025-09-04 04:17:28.165953', 'step': 3540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:28.267471', 'step': 3540, 'epoch': 3} {'type': 'loss', 'content': 0.0015539666637778282, 'timestamp': '2025-09-04 04:17:28.289355', 'step': 3541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:17:28.493842', 'step': 3541, 'epoch': 3} {'type': 'loss', 'content': 0.029507221654057503, 'timestamp': '2025-09-04 04:17:28.532966', 'step': 3542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:17:28.612593', 'step': 3542, 'epoch': 3} {'type': 'loss', 'content': 0.0015970682725310326, 'timestamp': '2025-09-04 04:17:28.626758', 'step': 3543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:17:28.744337', 'step': 3543, 'epoch': 3} {'type': 'loss', 'content': 0.008442920632660389, 'timestamp': '2025-09-04 04:17:28.767276', 'step': 3544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:17:28.873521', 'step': 3544, 'epoch': 3} {'type': 'loss', 'content': 0.027983790263533592, 'timestamp': '2025-09-04 04:17:28.895818', 'step': 3545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:28.987931', 'step': 3545, 'epoch': 3} {'type': 'loss', 'content': 0.0004234362568240613, 'timestamp': '2025-09-04 04:17:29.004675', 'step': 3546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:17:29.090019', 'step': 3546, 'epoch': 3} {'type': 'loss', 'content': 0.0007317407871596515, 'timestamp': '2025-09-04 04:17:29.105455', 'step': 3547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:29.208608', 'step': 3547, 'epoch': 3} {'type': 'loss', 'content': 0.00967488158494234, 'timestamp': '2025-09-04 04:17:29.228533', 'step': 3548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:17:29.321831', 'step': 3548, 'epoch': 3} {'type': 'loss', 'content': 0.02515988238155842, 'timestamp': '2025-09-04 04:17:29.341037', 'step': 3549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 04:17:29.514992', 'step': 3549, 'epoch': 3} {'type': 'loss', 'content': 0.000993523863144219, 'timestamp': '2025-09-04 04:17:29.549641', 'step': 3550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:29.645946', 'step': 3550, 'epoch': 3} {'type': 'loss', 'content': 0.000763634976465255, 'timestamp': '2025-09-04 04:17:29.663104', 'step': 3551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:17:29.747319', 'step': 3551, 'epoch': 3} {'type': 'loss', 'content': 0.000361975806299597, 'timestamp': '2025-09-04 04:17:29.763321', 'step': 3552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:17:29.846943', 'step': 3552, 'epoch': 3} {'type': 'loss', 'content': 0.0030570703092962503, 'timestamp': '2025-09-04 04:17:29.864182', 'step': 3553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:17:29.941189', 'step': 3553, 'epoch': 3} {'type': 'loss', 'content': 0.0028307621832937002, 'timestamp': '2025-09-04 04:17:29.955338', 'step': 3554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:30.060042', 'step': 3554, 'epoch': 3} {'type': 'loss', 'content': 0.0015227803960442543, 'timestamp': '2025-09-04 04:17:30.079268', 'step': 3555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:17:30.184667', 'step': 3555, 'epoch': 3} {'type': 'loss', 'content': 0.013519185595214367, 'timestamp': '2025-09-04 04:17:30.202939', 'step': 3556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:17:30.308622', 'step': 3556, 'epoch': 3} {'type': 'loss', 'content': 0.0007381472387351096, 'timestamp': '2025-09-04 04:17:30.330915', 'step': 3557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:17:30.534526', 'step': 3557, 'epoch': 3} {'type': 'loss', 'content': 0.0012235705507919192, 'timestamp': '2025-09-04 04:17:30.573628', 'step': 3558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:30.683910', 'step': 3558, 'epoch': 3} {'type': 'loss', 'content': 0.007621685042977333, 'timestamp': '2025-09-04 04:17:30.703119', 'step': 3559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:17:30.808101', 'step': 3559, 'epoch': 3} {'type': 'loss', 'content': 0.0009364414145238698, 'timestamp': '2025-09-04 04:17:30.827407', 'step': 3560, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:17:39.214990', 'step': 3560, 'epoch': 3} {'type': 'pplx', 'content': 289.38046074506144, 'timestamp': '2025-09-04 04:17:39.216730', 'step': 3560, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3560', 'timestamp': '2025-09-04 04:17:39.578194', 'step': 3560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:17:39.694895', 'step': 3560, 'epoch': 3} {'type': 'loss', 'content': 0.09077467024326324, 'timestamp': '2025-09-04 04:17:39.718681', 'step': 3561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:39.822623', 'step': 3561, 'epoch': 3} {'type': 'loss', 'content': 0.001359087647870183, 'timestamp': '2025-09-04 04:17:39.841874', 'step': 3562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:39.944941', 'step': 3562, 'epoch': 3} {'type': 'loss', 'content': 0.0016831067623570561, 'timestamp': '2025-09-04 04:17:39.964001', 'step': 3563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:17:40.059536', 'step': 3563, 'epoch': 3} {'type': 'loss', 'content': 0.04928234592080116, 'timestamp': '2025-09-04 04:17:40.077782', 'step': 3564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:17:40.170060', 'step': 3564, 'epoch': 3} {'type': 'loss', 'content': 0.02024707943201065, 'timestamp': '2025-09-04 04:17:40.189078', 'step': 3565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:40.289261', 'step': 3565, 'epoch': 3} {'type': 'loss', 'content': 0.02083570696413517, 'timestamp': '2025-09-04 04:17:40.308255', 'step': 3566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:40.409586', 'step': 3566, 'epoch': 3} {'type': 'loss', 'content': 0.006988304201513529, 'timestamp': '2025-09-04 04:17:40.428713', 'step': 3567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:40.532535', 'step': 3567, 'epoch': 3} {'type': 'loss', 'content': 0.005976350978016853, 'timestamp': '2025-09-04 04:17:40.552587', 'step': 3568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:17:40.625938', 'step': 3568, 'epoch': 3} {'type': 'loss', 'content': 0.006792505271732807, 'timestamp': '2025-09-04 04:17:40.640617', 'step': 3569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:40.742711', 'step': 3569, 'epoch': 3} {'type': 'loss', 'content': 0.004221724346280098, 'timestamp': '2025-09-04 04:17:40.761826', 'step': 3570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:40.863583', 'step': 3570, 'epoch': 3} {'type': 'loss', 'content': 0.0009288810542784631, 'timestamp': '2025-09-04 04:17:40.882442', 'step': 3571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:17:40.982227', 'step': 3571, 'epoch': 3} {'type': 'loss', 'content': 0.0005838017095811665, 'timestamp': '2025-09-04 04:17:41.001628', 'step': 3572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:41.090099', 'step': 3572, 'epoch': 3} {'type': 'loss', 'content': 0.0032299798913300037, 'timestamp': '2025-09-04 04:17:41.108485', 'step': 3573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:17:41.186326', 'step': 3573, 'epoch': 3} {'type': 'loss', 'content': 0.0007045165402814746, 'timestamp': '2025-09-04 04:17:41.200322', 'step': 3574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1472], 'flops': 29440178786048.0}, 'timestamp': '2025-09-04 04:17:41.415846', 'step': 3574, 'epoch': 3} {'type': 'loss', 'content': 0.004880525637418032, 'timestamp': '2025-09-04 04:17:41.456770', 'step': 3575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:17:41.552296', 'step': 3575, 'epoch': 3} {'type': 'loss', 'content': 0.01930762641131878, 'timestamp': '2025-09-04 04:17:41.570598', 'step': 3576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:17:41.646968', 'step': 3576, 'epoch': 3} {'type': 'loss', 'content': 0.006385389715433121, 'timestamp': '2025-09-04 04:17:41.662283', 'step': 3577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:41.765485', 'step': 3577, 'epoch': 3} {'type': 'loss', 'content': 0.004767129663378, 'timestamp': '2025-09-04 04:17:41.784700', 'step': 3578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 864], 'flops': 17280104967552.0}, 'timestamp': '2025-09-04 04:17:41.912097', 'step': 3578, 'epoch': 3} {'type': 'loss', 'content': 0.000834185048006475, 'timestamp': '2025-09-04 04:17:41.936484', 'step': 3579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:42.040776', 'step': 3579, 'epoch': 3} {'type': 'loss', 'content': 0.00024679276975803077, 'timestamp': '2025-09-04 04:17:42.060779', 'step': 3580, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:17:50.483560', 'step': 3580, 'epoch': 3} {'type': 'pplx', 'content': 284.2367375279463, 'timestamp': '2025-09-04 04:17:50.485827', 'step': 3580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:17:50.560931', 'step': 3580, 'epoch': 3} {'type': 'loss', 'content': 0.0006078935693949461, 'timestamp': '2025-09-04 04:17:50.576216', 'step': 3581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:17:50.681804', 'step': 3581, 'epoch': 3} {'type': 'loss', 'content': 0.0132825942710042, 'timestamp': '2025-09-04 04:17:50.700909', 'step': 3582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:50.804294', 'step': 3582, 'epoch': 3} {'type': 'loss', 'content': 0.012423519045114517, 'timestamp': '2025-09-04 04:17:50.823176', 'step': 3583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:17:50.903122', 'step': 3583, 'epoch': 3} {'type': 'loss', 'content': 0.00610441155731678, 'timestamp': '2025-09-04 04:17:50.918030', 'step': 3584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:51.016416', 'step': 3584, 'epoch': 3} {'type': 'loss', 'content': 0.03366504982113838, 'timestamp': '2025-09-04 04:17:51.037152', 'step': 3585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:17:51.145741', 'step': 3585, 'epoch': 3} {'type': 'loss', 'content': 0.0012228694977238774, 'timestamp': '2025-09-04 04:17:51.166017', 'step': 3586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:51.257368', 'step': 3586, 'epoch': 3} {'type': 'loss', 'content': 0.0005438943044282496, 'timestamp': '2025-09-04 04:17:51.274200', 'step': 3587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:17:51.362295', 'step': 3587, 'epoch': 3} {'type': 'loss', 'content': 0.007683582603931427, 'timestamp': '2025-09-04 04:17:51.378679', 'step': 3588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:17:51.469388', 'step': 3588, 'epoch': 3} {'type': 'loss', 'content': 0.0010738805867731571, 'timestamp': '2025-09-04 04:17:51.487793', 'step': 3589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:17:51.563545', 'step': 3589, 'epoch': 3} {'type': 'loss', 'content': 0.0031101806089282036, 'timestamp': '2025-09-04 04:17:51.577287', 'step': 3590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:51.676508', 'step': 3590, 'epoch': 3} {'type': 'loss', 'content': 0.09019530564546585, 'timestamp': '2025-09-04 04:17:51.693617', 'step': 3591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:51.794524', 'step': 3591, 'epoch': 3} {'type': 'loss', 'content': 0.17219799757003784, 'timestamp': '2025-09-04 04:17:51.814168', 'step': 3592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:17:51.915283', 'step': 3592, 'epoch': 3} {'type': 'loss', 'content': 0.005689322482794523, 'timestamp': '2025-09-04 04:17:51.936352', 'step': 3593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:52.032479', 'step': 3593, 'epoch': 3} {'type': 'loss', 'content': 0.018983401358127594, 'timestamp': '2025-09-04 04:17:52.049611', 'step': 3594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:52.150681', 'step': 3594, 'epoch': 3} {'type': 'loss', 'content': 0.0016854063142091036, 'timestamp': '2025-09-04 04:17:52.169545', 'step': 3595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:17:52.277625', 'step': 3595, 'epoch': 3} {'type': 'loss', 'content': 0.0018568774685263634, 'timestamp': '2025-09-04 04:17:52.298316', 'step': 3596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:52.396728', 'step': 3596, 'epoch': 3} {'type': 'loss', 'content': 0.0010966422269120812, 'timestamp': '2025-09-04 04:17:52.417397', 'step': 3597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:17:52.510543', 'step': 3597, 'epoch': 3} {'type': 'loss', 'content': 0.005582905374467373, 'timestamp': '2025-09-04 04:17:52.527659', 'step': 3598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:17:52.630415', 'step': 3598, 'epoch': 3} {'type': 'loss', 'content': 0.006567910313606262, 'timestamp': '2025-09-04 04:17:52.649399', 'step': 3599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:17:52.760999', 'step': 3599, 'epoch': 3} {'type': 'loss', 'content': 0.021000593900680542, 'timestamp': '2025-09-04 04:17:52.782422', 'step': 3600, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:01.223292', 'step': 3600, 'epoch': 3} {'type': 'pplx', 'content': 273.946033699607, 'timestamp': '2025-09-04 04:18:01.225358', 'step': 3600, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3600', 'timestamp': '2025-09-04 04:18:01.586167', 'step': 3600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:18:01.659422', 'step': 3600, 'epoch': 3} {'type': 'loss', 'content': 0.009026916697621346, 'timestamp': '2025-09-04 04:18:01.674374', 'step': 3601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:01.776530', 'step': 3601, 'epoch': 3} {'type': 'loss', 'content': 0.0015143795171752572, 'timestamp': '2025-09-04 04:18:01.795323', 'step': 3602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:18:01.880426', 'step': 3602, 'epoch': 3} {'type': 'loss', 'content': 0.012896225787699223, 'timestamp': '2025-09-04 04:18:01.895890', 'step': 3603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:18:01.989210', 'step': 3603, 'epoch': 3} {'type': 'loss', 'content': 0.014396066777408123, 'timestamp': '2025-09-04 04:18:02.007200', 'step': 3604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:02.097931', 'step': 3604, 'epoch': 3} {'type': 'loss', 'content': 0.0015026642940938473, 'timestamp': '2025-09-04 04:18:02.117140', 'step': 3605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:02.208664', 'step': 3605, 'epoch': 3} {'type': 'loss', 'content': 0.0049162269569933414, 'timestamp': '2025-09-04 04:18:02.225558', 'step': 3606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:02.329504', 'step': 3606, 'epoch': 3} {'type': 'loss', 'content': 0.0029911664314568043, 'timestamp': '2025-09-04 04:18:02.348781', 'step': 3607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:18:02.458731', 'step': 3607, 'epoch': 3} {'type': 'loss', 'content': 0.005600781179964542, 'timestamp': '2025-09-04 04:18:02.480005', 'step': 3608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:18:02.570782', 'step': 3608, 'epoch': 3} {'type': 'loss', 'content': 0.0017589996568858624, 'timestamp': '2025-09-04 04:18:02.589673', 'step': 3609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:02.692776', 'step': 3609, 'epoch': 3} {'type': 'loss', 'content': 0.0030466015450656414, 'timestamp': '2025-09-04 04:18:02.711967', 'step': 3610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:02.815915', 'step': 3610, 'epoch': 3} {'type': 'loss', 'content': 0.000875060330145061, 'timestamp': '2025-09-04 04:18:02.835190', 'step': 3611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:02.930579', 'step': 3611, 'epoch': 3} {'type': 'loss', 'content': 0.000565591617487371, 'timestamp': '2025-09-04 04:18:02.948740', 'step': 3612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:03.048836', 'step': 3612, 'epoch': 3} {'type': 'loss', 'content': 0.001949289464391768, 'timestamp': '2025-09-04 04:18:03.069950', 'step': 3613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:18:03.179274', 'step': 3613, 'epoch': 3} {'type': 'loss', 'content': 0.002045362489297986, 'timestamp': '2025-09-04 04:18:03.199578', 'step': 3614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:03.303512', 'step': 3614, 'epoch': 3} {'type': 'loss', 'content': 0.025945372879505157, 'timestamp': '2025-09-04 04:18:03.322728', 'step': 3615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:18:03.409357', 'step': 3615, 'epoch': 3} {'type': 'loss', 'content': 0.003748838324099779, 'timestamp': '2025-09-04 04:18:03.425852', 'step': 3616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:18:03.508941', 'step': 3616, 'epoch': 3} {'type': 'loss', 'content': 0.031195957213640213, 'timestamp': '2025-09-04 04:18:03.525966', 'step': 3617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:03.617028', 'step': 3617, 'epoch': 3} {'type': 'loss', 'content': 0.0037481507752090693, 'timestamp': '2025-09-04 04:18:03.633901', 'step': 3618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:03.734574', 'step': 3618, 'epoch': 3} {'type': 'loss', 'content': 0.0022598407231271267, 'timestamp': '2025-09-04 04:18:03.753419', 'step': 3619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:03.856714', 'step': 3619, 'epoch': 3} {'type': 'loss', 'content': 0.017032302916049957, 'timestamp': '2025-09-04 04:18:03.876386', 'step': 3620, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:12.287116', 'step': 3620, 'epoch': 3} {'type': 'pplx', 'content': 259.11186362945296, 'timestamp': '2025-09-04 04:18:12.288980', 'step': 3620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:18:12.368787', 'step': 3620, 'epoch': 3} {'type': 'loss', 'content': 0.005365411285310984, 'timestamp': '2025-09-04 04:18:12.385332', 'step': 3621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:18:12.495586', 'step': 3621, 'epoch': 3} {'type': 'loss', 'content': 0.03964783623814583, 'timestamp': '2025-09-04 04:18:12.515826', 'step': 3622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:18:12.651939', 'step': 3622, 'epoch': 3} {'type': 'loss', 'content': 0.005470898933708668, 'timestamp': '2025-09-04 04:18:12.678090', 'step': 3623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:12.779972', 'step': 3623, 'epoch': 3} {'type': 'loss', 'content': 0.0020204551983624697, 'timestamp': '2025-09-04 04:18:12.799866', 'step': 3624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:18:12.903116', 'step': 3624, 'epoch': 3} {'type': 'loss', 'content': 0.004941796418279409, 'timestamp': '2025-09-04 04:18:12.924916', 'step': 3625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:13.028080', 'step': 3625, 'epoch': 3} {'type': 'loss', 'content': 0.00373165775090456, 'timestamp': '2025-09-04 04:18:13.047297', 'step': 3626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:13.146084', 'step': 3626, 'epoch': 3} {'type': 'loss', 'content': 0.00431425916031003, 'timestamp': '2025-09-04 04:18:13.164715', 'step': 3627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:18:13.247659', 'step': 3627, 'epoch': 3} {'type': 'loss', 'content': 0.009402657859027386, 'timestamp': '2025-09-04 04:18:13.263508', 'step': 3628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:18:13.370905', 'step': 3628, 'epoch': 3} {'type': 'loss', 'content': 0.0007547914865426719, 'timestamp': '2025-09-04 04:18:13.393375', 'step': 3629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:18:13.487383', 'step': 3629, 'epoch': 3} {'type': 'loss', 'content': 0.0023410057183355093, 'timestamp': '2025-09-04 04:18:13.504478', 'step': 3630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:13.605540', 'step': 3630, 'epoch': 3} {'type': 'loss', 'content': 0.008979837410151958, 'timestamp': '2025-09-04 04:18:13.624342', 'step': 3631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1168], 'flops': 23360141876800.0}, 'timestamp': '2025-09-04 04:18:13.800587', 'step': 3631, 'epoch': 3} {'type': 'loss', 'content': 0.0013831626856699586, 'timestamp': '2025-09-04 04:18:13.833973', 'step': 3632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:18:13.941465', 'step': 3632, 'epoch': 3} {'type': 'loss', 'content': 0.000762230425607413, 'timestamp': '2025-09-04 04:18:13.964016', 'step': 3633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:14.067950', 'step': 3633, 'epoch': 3} {'type': 'loss', 'content': 0.001655717147514224, 'timestamp': '2025-09-04 04:18:14.087177', 'step': 3634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:18:14.165208', 'step': 3634, 'epoch': 3} {'type': 'loss', 'content': 0.00832951907068491, 'timestamp': '2025-09-04 04:18:14.179160', 'step': 3635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:14.283512', 'step': 3635, 'epoch': 3} {'type': 'loss', 'content': 0.02003934420645237, 'timestamp': '2025-09-04 04:18:14.303559', 'step': 3636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:14.404763', 'step': 3636, 'epoch': 3} {'type': 'loss', 'content': 0.003556882031261921, 'timestamp': '2025-09-04 04:18:14.425768', 'step': 3637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:18:14.536861', 'step': 3637, 'epoch': 3} {'type': 'loss', 'content': 0.010267372243106365, 'timestamp': '2025-09-04 04:18:14.557550', 'step': 3638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:14.659506', 'step': 3638, 'epoch': 3} {'type': 'loss', 'content': 0.004329850431531668, 'timestamp': '2025-09-04 04:18:14.678378', 'step': 3639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:14.779574', 'step': 3639, 'epoch': 3} {'type': 'loss', 'content': 0.0016127810813486576, 'timestamp': '2025-09-04 04:18:14.799189', 'step': 3640, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:23.369155', 'step': 3640, 'epoch': 3} {'type': 'pplx', 'content': 255.24736725474492, 'timestamp': '2025-09-04 04:18:23.373987', 'step': 3640, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3640', 'timestamp': '2025-09-04 04:18:23.755326', 'step': 3640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:18:23.832501', 'step': 3640, 'epoch': 3} {'type': 'loss', 'content': 0.0019840069580823183, 'timestamp': '2025-09-04 04:18:23.847882', 'step': 3641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:23.950295', 'step': 3641, 'epoch': 3} {'type': 'loss', 'content': 0.011246290989220142, 'timestamp': '2025-09-04 04:18:23.969312', 'step': 3642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 880], 'flops': 17600106910144.0}, 'timestamp': '2025-09-04 04:18:24.100055', 'step': 3642, 'epoch': 3} {'type': 'loss', 'content': 0.0008128046174533665, 'timestamp': '2025-09-04 04:18:24.123694', 'step': 3643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:24.217210', 'step': 3643, 'epoch': 3} {'type': 'loss', 'content': 0.002894132863730192, 'timestamp': '2025-09-04 04:18:24.234773', 'step': 3644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1008], 'flops': 20160122450880.0}, 'timestamp': '2025-09-04 04:18:24.379481', 'step': 3644, 'epoch': 3} {'type': 'loss', 'content': 0.0007267083274200559, 'timestamp': '2025-09-04 04:18:24.410597', 'step': 3645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:24.514114', 'step': 3645, 'epoch': 3} {'type': 'loss', 'content': 0.003020522417500615, 'timestamp': '2025-09-04 04:18:24.533436', 'step': 3646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:18:24.642738', 'step': 3646, 'epoch': 3} {'type': 'loss', 'content': 0.007278635632246733, 'timestamp': '2025-09-04 04:18:24.663068', 'step': 3647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:24.762487', 'step': 3647, 'epoch': 3} {'type': 'loss', 'content': 0.020761605352163315, 'timestamp': '2025-09-04 04:18:24.781998', 'step': 3648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:24.885520', 'step': 3648, 'epoch': 3} {'type': 'loss', 'content': 0.0007095049950294197, 'timestamp': '2025-09-04 04:18:24.906792', 'step': 3649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:25.028566', 'step': 3649, 'epoch': 3} {'type': 'loss', 'content': 0.002251858590170741, 'timestamp': '2025-09-04 04:18:25.047729', 'step': 3650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:18:25.126742', 'step': 3650, 'epoch': 3} {'type': 'loss', 'content': 0.0013295934768393636, 'timestamp': '2025-09-04 04:18:25.140980', 'step': 3651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:25.231776', 'step': 3651, 'epoch': 3} {'type': 'loss', 'content': 0.023495763540267944, 'timestamp': '2025-09-04 04:18:25.249318', 'step': 3652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:18:25.366140', 'step': 3652, 'epoch': 3} {'type': 'loss', 'content': 0.012384490109980106, 'timestamp': '2025-09-04 04:18:25.390466', 'step': 3653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:25.484835', 'step': 3653, 'epoch': 3} {'type': 'loss', 'content': 0.0022683092392981052, 'timestamp': '2025-09-04 04:18:25.502380', 'step': 3654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:18:25.577592', 'step': 3654, 'epoch': 3} {'type': 'loss', 'content': 0.0017986752791330218, 'timestamp': '2025-09-04 04:18:25.591074', 'step': 3655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:25.694822', 'step': 3655, 'epoch': 3} {'type': 'loss', 'content': 0.007066111546009779, 'timestamp': '2025-09-04 04:18:25.714876', 'step': 3656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:25.811673', 'step': 3656, 'epoch': 3} {'type': 'loss', 'content': 0.002684858627617359, 'timestamp': '2025-09-04 04:18:25.832055', 'step': 3657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 928], 'flops': 18560112737920.0}, 'timestamp': '2025-09-04 04:18:25.966783', 'step': 3657, 'epoch': 3} {'type': 'loss', 'content': 0.001188501249998808, 'timestamp': '2025-09-04 04:18:25.992867', 'step': 3658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:26.095363', 'step': 3658, 'epoch': 3} {'type': 'loss', 'content': 0.005962354131042957, 'timestamp': '2025-09-04 04:18:26.114338', 'step': 3659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:18:26.201582', 'step': 3659, 'epoch': 3} {'type': 'loss', 'content': 0.0010241111740469933, 'timestamp': '2025-09-04 04:18:26.218028', 'step': 3660, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:34.655558', 'step': 3660, 'epoch': 3} {'type': 'pplx', 'content': 260.65734102692426, 'timestamp': '2025-09-04 04:18:34.657505', 'step': 3660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:34.756784', 'step': 3660, 'epoch': 3} {'type': 'loss', 'content': 0.0069060372188687325, 'timestamp': '2025-09-04 04:18:34.778088', 'step': 3661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:18:34.856779', 'step': 3661, 'epoch': 3} {'type': 'loss', 'content': 0.0014873266918584704, 'timestamp': '2025-09-04 04:18:34.871044', 'step': 3662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:18:34.948854', 'step': 3662, 'epoch': 3} {'type': 'loss', 'content': 0.005059961229562759, 'timestamp': '2025-09-04 04:18:34.963083', 'step': 3663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:18:35.081870', 'step': 3663, 'epoch': 3} {'type': 'loss', 'content': 0.00022246531443670392, 'timestamp': '2025-09-04 04:18:35.103259', 'step': 3664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:18:35.206724', 'step': 3664, 'epoch': 3} {'type': 'loss', 'content': 0.005162604618817568, 'timestamp': '2025-09-04 04:18:35.222293', 'step': 3665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:35.326289', 'step': 3665, 'epoch': 3} {'type': 'loss', 'content': 0.0015102955512702465, 'timestamp': '2025-09-04 04:18:35.345704', 'step': 3666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:18:35.452901', 'step': 3666, 'epoch': 3} {'type': 'loss', 'content': 0.0005610152729786932, 'timestamp': '2025-09-04 04:18:35.472972', 'step': 3667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:35.578242', 'step': 3667, 'epoch': 3} {'type': 'loss', 'content': 0.002261679619550705, 'timestamp': '2025-09-04 04:18:35.598405', 'step': 3668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:18:35.706464', 'step': 3668, 'epoch': 3} {'type': 'loss', 'content': 0.009873145259916782, 'timestamp': '2025-09-04 04:18:35.728526', 'step': 3669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:35.822179', 'step': 3669, 'epoch': 3} {'type': 'loss', 'content': 0.0006896741688251495, 'timestamp': '2025-09-04 04:18:35.839074', 'step': 3670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:35.933182', 'step': 3670, 'epoch': 3} {'type': 'loss', 'content': 0.0011464201379567385, 'timestamp': '2025-09-04 04:18:35.950685', 'step': 3671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:36.044556', 'step': 3671, 'epoch': 3} {'type': 'loss', 'content': 0.001933246268890798, 'timestamp': '2025-09-04 04:18:36.062058', 'step': 3672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:18:36.153756', 'step': 3672, 'epoch': 3} {'type': 'loss', 'content': 0.005047465208917856, 'timestamp': '2025-09-04 04:18:36.173094', 'step': 3673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:18:36.264431', 'step': 3673, 'epoch': 3} {'type': 'loss', 'content': 0.01326705701649189, 'timestamp': '2025-09-04 04:18:36.279731', 'step': 3674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:36.390540', 'step': 3674, 'epoch': 3} {'type': 'loss', 'content': 0.001728372648358345, 'timestamp': '2025-09-04 04:18:36.409184', 'step': 3675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:36.511047', 'step': 3675, 'epoch': 3} {'type': 'loss', 'content': 0.00010324228787794709, 'timestamp': '2025-09-04 04:18:36.531104', 'step': 3676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:36.631106', 'step': 3676, 'epoch': 3} {'type': 'loss', 'content': 0.0006894250982441008, 'timestamp': '2025-09-04 04:18:36.651977', 'step': 3677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:18:36.737949', 'step': 3677, 'epoch': 3} {'type': 'loss', 'content': 0.0178501196205616, 'timestamp': '2025-09-04 04:18:36.753638', 'step': 3678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:36.847722', 'step': 3678, 'epoch': 3} {'type': 'loss', 'content': 0.004994639195501804, 'timestamp': '2025-09-04 04:18:36.865037', 'step': 3679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:36.970830', 'step': 3679, 'epoch': 3} {'type': 'loss', 'content': 9.363189019495621e-05, 'timestamp': '2025-09-04 04:18:36.991002', 'step': 3680, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:45.564502', 'step': 3680, 'epoch': 3} {'type': 'pplx', 'content': 269.183933665212, 'timestamp': '2025-09-04 04:18:45.567064', 'step': 3680, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3680', 'timestamp': '2025-09-04 04:18:45.962752', 'step': 3680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:46.060324', 'step': 3680, 'epoch': 3} {'type': 'loss', 'content': 0.016621418297290802, 'timestamp': '2025-09-04 04:18:46.080876', 'step': 3681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:18:46.188302', 'step': 3681, 'epoch': 3} {'type': 'loss', 'content': 0.010536265559494495, 'timestamp': '2025-09-04 04:18:46.208241', 'step': 3682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:46.313573', 'step': 3682, 'epoch': 3} {'type': 'loss', 'content': 0.010933700948953629, 'timestamp': '2025-09-04 04:18:46.332960', 'step': 3683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1424], 'flops': 28480172958272.0}, 'timestamp': '2025-09-04 04:18:46.545657', 'step': 3683, 'epoch': 3} {'type': 'loss', 'content': 0.006731206551194191, 'timestamp': '2025-09-04 04:18:46.587241', 'step': 3684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:46.686304', 'step': 3684, 'epoch': 3} {'type': 'loss', 'content': 0.017399858683347702, 'timestamp': '2025-09-04 04:18:46.706662', 'step': 3685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:46.797807', 'step': 3685, 'epoch': 3} {'type': 'loss', 'content': 0.0017027389258146286, 'timestamp': '2025-09-04 04:18:46.814735', 'step': 3686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:46.909873', 'step': 3686, 'epoch': 3} {'type': 'loss', 'content': 0.0014917801599949598, 'timestamp': '2025-09-04 04:18:46.927291', 'step': 3687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:47.032604', 'step': 3687, 'epoch': 3} {'type': 'loss', 'content': 0.007048693019896746, 'timestamp': '2025-09-04 04:18:47.052648', 'step': 3688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:18:47.134919', 'step': 3688, 'epoch': 3} {'type': 'loss', 'content': 0.023266077041625977, 'timestamp': '2025-09-04 04:18:47.151532', 'step': 3689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:18:47.260045', 'step': 3689, 'epoch': 3} {'type': 'loss', 'content': 0.005440382286906242, 'timestamp': '2025-09-04 04:18:47.280442', 'step': 3690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:47.392196', 'step': 3690, 'epoch': 3} {'type': 'loss', 'content': 0.005371682345867157, 'timestamp': '2025-09-04 04:18:47.411099', 'step': 3691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:18:47.506234', 'step': 3691, 'epoch': 3} {'type': 'loss', 'content': 0.03604043647646904, 'timestamp': '2025-09-04 04:18:47.524627', 'step': 3692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:18:47.598228', 'step': 3692, 'epoch': 3} {'type': 'loss', 'content': 0.016086634248495102, 'timestamp': '2025-09-04 04:18:47.613044', 'step': 3693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:18:47.712403', 'step': 3693, 'epoch': 3} {'type': 'loss', 'content': 0.003654019208624959, 'timestamp': '2025-09-04 04:18:47.731016', 'step': 3694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:18:47.849362', 'step': 3694, 'epoch': 3} {'type': 'loss', 'content': 0.0030856921803206205, 'timestamp': '2025-09-04 04:18:47.871448', 'step': 3695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:47.976083', 'step': 3695, 'epoch': 3} {'type': 'loss', 'content': 0.0017932873452082276, 'timestamp': '2025-09-04 04:18:47.996135', 'step': 3696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:18:48.139248', 'step': 3696, 'epoch': 3} {'type': 'loss', 'content': 0.0021279409993439913, 'timestamp': '2025-09-04 04:18:48.170357', 'step': 3697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:18:48.278683', 'step': 3697, 'epoch': 3} {'type': 'loss', 'content': 0.0009123628260567784, 'timestamp': '2025-09-04 04:18:48.299080', 'step': 3698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:18:48.395269', 'step': 3698, 'epoch': 3} {'type': 'loss', 'content': 0.00036065455060452223, 'timestamp': '2025-09-04 04:18:48.412856', 'step': 3699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:18:48.507808', 'step': 3699, 'epoch': 3} {'type': 'loss', 'content': 0.019889622926712036, 'timestamp': '2025-09-04 04:18:48.525968', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:18:57.005155', 'step': 3700, 'epoch': 3} {'type': 'pplx', 'content': 276.00496979251966, 'timestamp': '2025-09-04 04:18:57.008172', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:57.106990', 'step': 3700, 'epoch': 3} {'type': 'loss', 'content': 0.03130375221371651, 'timestamp': '2025-09-04 04:18:57.127767', 'step': 3701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:57.230808', 'step': 3701, 'epoch': 3} {'type': 'loss', 'content': 0.054532185196876526, 'timestamp': '2025-09-04 04:18:57.249702', 'step': 3702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:18:57.325769', 'step': 3702, 'epoch': 3} {'type': 'loss', 'content': 0.008000990375876427, 'timestamp': '2025-09-04 04:18:57.339443', 'step': 3703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:18:57.430429', 'step': 3703, 'epoch': 3} {'type': 'loss', 'content': 0.00035263263271190226, 'timestamp': '2025-09-04 04:18:57.448000', 'step': 3704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:18:57.554283', 'step': 3704, 'epoch': 3} {'type': 'loss', 'content': 0.003820637473836541, 'timestamp': '2025-09-04 04:18:57.576978', 'step': 3705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:18:57.672220', 'step': 3705, 'epoch': 3} {'type': 'loss', 'content': 0.006798545364290476, 'timestamp': '2025-09-04 04:18:57.689846', 'step': 3706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:18:57.793776', 'step': 3706, 'epoch': 3} {'type': 'loss', 'content': 0.0006438225973397493, 'timestamp': '2025-09-04 04:18:57.813047', 'step': 3707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:18:57.889419', 'step': 3707, 'epoch': 3} {'type': 'loss', 'content': 0.00427033007144928, 'timestamp': '2025-09-04 04:18:57.903748', 'step': 3708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:18:57.994336', 'step': 3708, 'epoch': 3} {'type': 'loss', 'content': 0.0027048927731812, 'timestamp': '2025-09-04 04:18:58.013185', 'step': 3709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:18:58.121817', 'step': 3709, 'epoch': 3} {'type': 'loss', 'content': 0.028957033529877663, 'timestamp': '2025-09-04 04:18:58.140618', 'step': 3710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:58.242805', 'step': 3710, 'epoch': 3} {'type': 'loss', 'content': 0.0006207419210113585, 'timestamp': '2025-09-04 04:18:58.261928', 'step': 3711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:18:58.372427', 'step': 3711, 'epoch': 3} {'type': 'loss', 'content': 0.011274183169007301, 'timestamp': '2025-09-04 04:18:58.393827', 'step': 3712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:18:58.474801', 'step': 3712, 'epoch': 3} {'type': 'loss', 'content': 0.001740701962262392, 'timestamp': '2025-09-04 04:18:58.491381', 'step': 3713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1184], 'flops': 23680143819392.0}, 'timestamp': '2025-09-04 04:18:58.662835', 'step': 3713, 'epoch': 3} {'type': 'loss', 'content': 0.008133734576404095, 'timestamp': '2025-09-04 04:18:58.697505', 'step': 3714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:58.799806', 'step': 3714, 'epoch': 3} {'type': 'loss', 'content': 0.011398537084460258, 'timestamp': '2025-09-04 04:18:58.819083', 'step': 3715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:58.922406', 'step': 3715, 'epoch': 3} {'type': 'loss', 'content': 0.006214221939444542, 'timestamp': '2025-09-04 04:18:58.942354', 'step': 3716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:18:59.044963', 'step': 3716, 'epoch': 3} {'type': 'loss', 'content': 0.00310851470567286, 'timestamp': '2025-09-04 04:18:59.066118', 'step': 3717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:18:59.142839', 'step': 3717, 'epoch': 3} {'type': 'loss', 'content': 0.003634780179709196, 'timestamp': '2025-09-04 04:18:59.156277', 'step': 3718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:18:59.252361', 'step': 3718, 'epoch': 3} {'type': 'loss', 'content': 0.0004398180462885648, 'timestamp': '2025-09-04 04:18:59.270038', 'step': 3719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:18:59.357108', 'step': 3719, 'epoch': 3} {'type': 'loss', 'content': 0.0019110249122604728, 'timestamp': '2025-09-04 04:18:59.373500', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:19:07.964079', 'step': 3720, 'epoch': 3} {'type': 'pplx', 'content': 281.6937485671799, 'timestamp': '2025-09-04 04:19:07.966081', 'step': 3720, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3720', 'timestamp': '2025-09-04 04:19:08.325947', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:19:08.399210', 'step': 3720, 'epoch': 3} {'type': 'loss', 'content': 0.012652603909373283, 'timestamp': '2025-09-04 04:19:08.414237', 'step': 3721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:19:08.508172', 'step': 3721, 'epoch': 3} {'type': 'loss', 'content': 0.007865807972848415, 'timestamp': '2025-09-04 04:19:08.525527', 'step': 3722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:08.628679', 'step': 3722, 'epoch': 3} {'type': 'loss', 'content': 0.03159695491194725, 'timestamp': '2025-09-04 04:19:08.647870', 'step': 3723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:08.735900', 'step': 3723, 'epoch': 3} {'type': 'loss', 'content': 0.004598596598953009, 'timestamp': '2025-09-04 04:19:08.752357', 'step': 3724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1232], 'flops': 24640149647168.0}, 'timestamp': '2025-09-04 04:19:08.931081', 'step': 3724, 'epoch': 3} {'type': 'loss', 'content': 0.013268624432384968, 'timestamp': '2025-09-04 04:19:08.968763', 'step': 3725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:09.054834', 'step': 3725, 'epoch': 3} {'type': 'loss', 'content': 0.0011762577341869473, 'timestamp': '2025-09-04 04:19:09.070399', 'step': 3726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:09.173921', 'step': 3726, 'epoch': 3} {'type': 'loss', 'content': 0.04035170376300812, 'timestamp': '2025-09-04 04:19:09.193294', 'step': 3727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:09.292993', 'step': 3727, 'epoch': 3} {'type': 'loss', 'content': 0.000641504826489836, 'timestamp': '2025-09-04 04:19:09.312305', 'step': 3728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 944], 'flops': 18880114680512.0}, 'timestamp': '2025-09-04 04:19:09.445418', 'step': 3728, 'epoch': 3} {'type': 'loss', 'content': 0.00042860963731072843, 'timestamp': '2025-09-04 04:19:09.474369', 'step': 3729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:09.561153', 'step': 3729, 'epoch': 3} {'type': 'loss', 'content': 0.01674291118979454, 'timestamp': '2025-09-04 04:19:09.576784', 'step': 3730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:19:09.687305', 'step': 3730, 'epoch': 3} {'type': 'loss', 'content': 0.04544749855995178, 'timestamp': '2025-09-04 04:19:09.707970', 'step': 3731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:19:09.809977', 'step': 3731, 'epoch': 3} {'type': 'loss', 'content': 0.0015480854781344533, 'timestamp': '2025-09-04 04:19:09.830021', 'step': 3732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:19:09.918419', 'step': 3732, 'epoch': 3} {'type': 'loss', 'content': 0.00010000986367231235, 'timestamp': '2025-09-04 04:19:09.936774', 'step': 3733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:10.037412', 'step': 3733, 'epoch': 3} {'type': 'loss', 'content': 0.014707312919199467, 'timestamp': '2025-09-04 04:19:10.056314', 'step': 3734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:19:10.139093', 'step': 3734, 'epoch': 3} {'type': 'loss', 'content': 0.0023547126911580563, 'timestamp': '2025-09-04 04:19:10.153214', 'step': 3735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:10.238659', 'step': 3735, 'epoch': 3} {'type': 'loss', 'content': 0.005836615804582834, 'timestamp': '2025-09-04 04:19:10.254980', 'step': 3736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:10.353590', 'step': 3736, 'epoch': 3} {'type': 'loss', 'content': 0.00020389581914059818, 'timestamp': '2025-09-04 04:19:10.374400', 'step': 3737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:10.478398', 'step': 3737, 'epoch': 3} {'type': 'loss', 'content': 0.0003900925803463906, 'timestamp': '2025-09-04 04:19:10.497684', 'step': 3738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:10.596881', 'step': 3738, 'epoch': 3} {'type': 'loss', 'content': 6.265490083023906e-05, 'timestamp': '2025-09-04 04:19:10.615552', 'step': 3739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 960], 'flops': 19200116623104.0}, 'timestamp': '2025-09-04 04:19:10.753104', 'step': 3739, 'epoch': 3} {'type': 'loss', 'content': 0.04083415865898132, 'timestamp': '2025-09-04 04:19:10.780282', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:19:19.338127', 'step': 3740, 'epoch': 3} {'type': 'pplx', 'content': 285.6040904345629, 'timestamp': '2025-09-04 04:19:19.340007', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:19:19.457232', 'step': 3740, 'epoch': 3} {'type': 'loss', 'content': 0.0010513699380680919, 'timestamp': '2025-09-04 04:19:19.482682', 'step': 3741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1376], 'flops': 27520167130496.0}, 'timestamp': '2025-09-04 04:19:19.687690', 'step': 3741, 'epoch': 3} {'type': 'loss', 'content': 0.0182512030005455, 'timestamp': '2025-09-04 04:19:19.726797', 'step': 3742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:19.838392', 'step': 3742, 'epoch': 3} {'type': 'loss', 'content': 0.0008120352867990732, 'timestamp': '2025-09-04 04:19:19.859139', 'step': 3743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:19:19.958398', 'step': 3743, 'epoch': 3} {'type': 'loss', 'content': 0.0015680500073358417, 'timestamp': '2025-09-04 04:19:19.976612', 'step': 3744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:19:20.080286', 'step': 3744, 'epoch': 3} {'type': 'loss', 'content': 0.007197872269898653, 'timestamp': '2025-09-04 04:19:20.099097', 'step': 3745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:19:20.227801', 'step': 3745, 'epoch': 3} {'type': 'loss', 'content': 0.011416764929890633, 'timestamp': '2025-09-04 04:19:20.248322', 'step': 3746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:20.402738', 'step': 3746, 'epoch': 3} {'type': 'loss', 'content': 0.004257888998836279, 'timestamp': '2025-09-04 04:19:20.423364', 'step': 3747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:20.563747', 'step': 3747, 'epoch': 3} {'type': 'loss', 'content': 0.004272022750228643, 'timestamp': '2025-09-04 04:19:20.583768', 'step': 3748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:19:20.696830', 'step': 3748, 'epoch': 3} {'type': 'loss', 'content': 0.0040796962566673756, 'timestamp': '2025-09-04 04:19:20.716140', 'step': 3749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:20.816046', 'step': 3749, 'epoch': 3} {'type': 'loss', 'content': 0.0027478632982820272, 'timestamp': '2025-09-04 04:19:20.835598', 'step': 3750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:19:20.984592', 'step': 3750, 'epoch': 3} {'type': 'loss', 'content': 0.0048896921798586845, 'timestamp': '2025-09-04 04:19:21.003477', 'step': 3751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:21.136153', 'step': 3751, 'epoch': 3} {'type': 'loss', 'content': 0.00022235576761886477, 'timestamp': '2025-09-04 04:19:21.152425', 'step': 3752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:19:21.253033', 'step': 3752, 'epoch': 3} {'type': 'loss', 'content': 0.009761957451701164, 'timestamp': '2025-09-04 04:19:21.272287', 'step': 3753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:21.379524', 'step': 3753, 'epoch': 3} {'type': 'loss', 'content': 0.006642633117735386, 'timestamp': '2025-09-04 04:19:21.398711', 'step': 3754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:21.513529', 'step': 3754, 'epoch': 3} {'type': 'loss', 'content': 0.0035070693120360374, 'timestamp': '2025-09-04 04:19:21.534261', 'step': 3755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:21.639442', 'step': 3755, 'epoch': 3} {'type': 'loss', 'content': 0.04340730234980583, 'timestamp': '2025-09-04 04:19:21.659626', 'step': 3756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:21.754047', 'step': 3756, 'epoch': 3} {'type': 'loss', 'content': 0.02027786523103714, 'timestamp': '2025-09-04 04:19:21.771006', 'step': 3757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 832], 'flops': 16640101082368.0}, 'timestamp': '2025-09-04 04:19:21.906935', 'step': 3757, 'epoch': 3} {'type': 'loss', 'content': 0.029796713963150978, 'timestamp': '2025-09-04 04:19:21.930003', 'step': 3758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:22.044773', 'step': 3758, 'epoch': 3} {'type': 'loss', 'content': 0.003318408038467169, 'timestamp': '2025-09-04 04:19:22.065227', 'step': 3759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:22.205933', 'step': 3759, 'epoch': 3} {'type': 'loss', 'content': 0.00017196725821122527, 'timestamp': '2025-09-04 04:19:22.228658', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:19:31.072582', 'step': 3760, 'epoch': 3} {'type': 'pplx', 'content': 286.6894182494486, 'timestamp': '2025-09-04 04:19:31.075091', 'step': 3760, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3760', 'timestamp': '2025-09-04 04:19:31.569145', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:19:31.696385', 'step': 3760, 'epoch': 3} {'type': 'loss', 'content': 0.031767360866069794, 'timestamp': '2025-09-04 04:19:31.718702', 'step': 3761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:19:31.824881', 'step': 3761, 'epoch': 3} {'type': 'loss', 'content': 0.008120999671518803, 'timestamp': '2025-09-04 04:19:31.843846', 'step': 3762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:19:31.923836', 'step': 3762, 'epoch': 3} {'type': 'loss', 'content': 0.04948921501636505, 'timestamp': '2025-09-04 04:19:31.937734', 'step': 3763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:32.043823', 'step': 3763, 'epoch': 3} {'type': 'loss', 'content': 0.0013902034843340516, 'timestamp': '2025-09-04 04:19:32.063625', 'step': 3764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:32.162247', 'step': 3764, 'epoch': 3} {'type': 'loss', 'content': 0.0031580275390297174, 'timestamp': '2025-09-04 04:19:32.182508', 'step': 3765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:19:32.277902', 'step': 3765, 'epoch': 3} {'type': 'loss', 'content': 0.0007031270652078092, 'timestamp': '2025-09-04 04:19:32.295167', 'step': 3766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:19:32.446073', 'step': 3766, 'epoch': 3} {'type': 'loss', 'content': 0.006634030491113663, 'timestamp': '2025-09-04 04:19:32.474137', 'step': 3767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:19:32.546675', 'step': 3767, 'epoch': 3} {'type': 'loss', 'content': 0.001953285885974765, 'timestamp': '2025-09-04 04:19:32.559867', 'step': 3768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:32.663220', 'step': 3768, 'epoch': 3} {'type': 'loss', 'content': 0.0026485491544008255, 'timestamp': '2025-09-04 04:19:32.684306', 'step': 3769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:19:32.796110', 'step': 3769, 'epoch': 3} {'type': 'loss', 'content': 0.0007451950805261731, 'timestamp': '2025-09-04 04:19:32.816335', 'step': 3770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1120], 'flops': 22400136049024.0}, 'timestamp': '2025-09-04 04:19:32.980248', 'step': 3770, 'epoch': 3} {'type': 'loss', 'content': 0.00019487171084620059, 'timestamp': '2025-09-04 04:19:33.012027', 'step': 3771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:19:33.098017', 'step': 3771, 'epoch': 3} {'type': 'loss', 'content': 0.0070730033330619335, 'timestamp': '2025-09-04 04:19:33.113870', 'step': 3772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:33.217274', 'step': 3772, 'epoch': 3} {'type': 'loss', 'content': 0.0077186161652207375, 'timestamp': '2025-09-04 04:19:33.238178', 'step': 3773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 416], 'flops': 8320050574976.0}, 'timestamp': '2025-09-04 04:19:33.310724', 'step': 3773, 'epoch': 3} {'type': 'loss', 'content': 0.003549723420292139, 'timestamp': '2025-09-04 04:19:33.323108', 'step': 3774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:33.436045', 'step': 3774, 'epoch': 3} {'type': 'loss', 'content': 0.026352064684033394, 'timestamp': '2025-09-04 04:19:33.456603', 'step': 3775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:33.544446', 'step': 3775, 'epoch': 3} {'type': 'loss', 'content': 0.007274407893419266, 'timestamp': '2025-09-04 04:19:33.560495', 'step': 3776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:19:33.663022', 'step': 3776, 'epoch': 3} {'type': 'loss', 'content': 0.005042992066591978, 'timestamp': '2025-09-04 04:19:33.683851', 'step': 3777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:33.785311', 'step': 3777, 'epoch': 3} {'type': 'loss', 'content': 0.00039034750079736114, 'timestamp': '2025-09-04 04:19:33.803786', 'step': 3778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:19:33.883644', 'step': 3778, 'epoch': 3} {'type': 'loss', 'content': 0.00030172173865139484, 'timestamp': '2025-09-04 04:19:33.897563', 'step': 3779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:19:34.007601', 'step': 3779, 'epoch': 3} {'type': 'loss', 'content': 0.017556700855493546, 'timestamp': '2025-09-04 04:19:34.028541', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:19:42.961188', 'step': 3780, 'epoch': 3} {'type': 'pplx', 'content': 286.5459421244075, 'timestamp': '2025-09-04 04:19:42.963267', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:43.045692', 'step': 3780, 'epoch': 3} {'type': 'loss', 'content': 0.012321644462645054, 'timestamp': '2025-09-04 04:19:43.062957', 'step': 3781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:19:43.157605', 'step': 3781, 'epoch': 3} {'type': 'loss', 'content': 0.002354747848585248, 'timestamp': '2025-09-04 04:19:43.175092', 'step': 3782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:19:43.282378', 'step': 3782, 'epoch': 3} {'type': 'loss', 'content': 0.026513388380408287, 'timestamp': '2025-09-04 04:19:43.302750', 'step': 3783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:19:43.378659', 'step': 3783, 'epoch': 3} {'type': 'loss', 'content': 0.003994217608124018, 'timestamp': '2025-09-04 04:19:43.393283', 'step': 3784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:19:43.496515', 'step': 3784, 'epoch': 3} {'type': 'loss', 'content': 0.001885236008092761, 'timestamp': '2025-09-04 04:19:43.518555', 'step': 3785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:43.630362', 'step': 3785, 'epoch': 3} {'type': 'loss', 'content': 0.011333000846207142, 'timestamp': '2025-09-04 04:19:43.651043', 'step': 3786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:19:43.744476', 'step': 3786, 'epoch': 3} {'type': 'loss', 'content': 0.0012821113923564553, 'timestamp': '2025-09-04 04:19:43.761698', 'step': 3787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:43.863342', 'step': 3787, 'epoch': 3} {'type': 'loss', 'content': 0.0025108223780989647, 'timestamp': '2025-09-04 04:19:43.882925', 'step': 3788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1088], 'flops': 21760132163840.0}, 'timestamp': '2025-09-04 04:19:44.035332', 'step': 3788, 'epoch': 3} {'type': 'loss', 'content': 0.016976939514279366, 'timestamp': '2025-09-04 04:19:44.069075', 'step': 3789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:44.176089', 'step': 3789, 'epoch': 3} {'type': 'loss', 'content': 0.007931654341518879, 'timestamp': '2025-09-04 04:19:44.195309', 'step': 3790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:19:44.291098', 'step': 3790, 'epoch': 3} {'type': 'loss', 'content': 0.005856201983988285, 'timestamp': '2025-09-04 04:19:44.308591', 'step': 3791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:19:44.417595', 'step': 3791, 'epoch': 3} {'type': 'loss', 'content': 0.0056821079924702644, 'timestamp': '2025-09-04 04:19:44.438676', 'step': 3792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:19:44.523040', 'step': 3792, 'epoch': 3} {'type': 'loss', 'content': 0.00843026302754879, 'timestamp': '2025-09-04 04:19:44.540136', 'step': 3793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:44.633107', 'step': 3793, 'epoch': 3} {'type': 'loss', 'content': 0.009367075748741627, 'timestamp': '2025-09-04 04:19:44.648653', 'step': 3794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:44.749854', 'step': 3794, 'epoch': 3} {'type': 'loss', 'content': 0.01958874613046646, 'timestamp': '2025-09-04 04:19:44.768794', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:19:44.863243', 'step': 3795, 'epoch': 3} {'type': 'loss', 'content': 0.003997324500232935, 'timestamp': '2025-09-04 04:19:44.881646', 'step': 3796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:44.986383', 'step': 3796, 'epoch': 3} {'type': 'loss', 'content': 0.00712942611426115, 'timestamp': '2025-09-04 04:19:45.007700', 'step': 3797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 800], 'flops': 16000097197184.0}, 'timestamp': '2025-09-04 04:19:45.127722', 'step': 3797, 'epoch': 3} {'type': 'loss', 'content': 0.004085378255695105, 'timestamp': '2025-09-04 04:19:45.149603', 'step': 3798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:19:45.247231', 'step': 3798, 'epoch': 3} {'type': 'loss', 'content': 0.0033143635373562574, 'timestamp': '2025-09-04 04:19:45.264092', 'step': 3799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:19:45.363499', 'step': 3799, 'epoch': 3} {'type': 'loss', 'content': 0.01740036904811859, 'timestamp': '2025-09-04 04:19:45.381789', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:19:53.897411', 'step': 3800, 'epoch': 3} {'type': 'pplx', 'content': 287.8095566615489, 'timestamp': '2025-09-04 04:19:53.899499', 'step': 3800, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3800', 'timestamp': '2025-09-04 04:19:54.373165', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 384], 'flops': 7680046689792.0}, 'timestamp': '2025-09-04 04:19:54.434055', 'step': 3800, 'epoch': 3} {'type': 'loss', 'content': 0.004595032427459955, 'timestamp': '2025-09-04 04:19:54.446095', 'step': 3801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:19:54.529226', 'step': 3801, 'epoch': 3} {'type': 'loss', 'content': 0.005345925688743591, 'timestamp': '2025-09-04 04:19:54.544393', 'step': 3802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 896], 'flops': 17920108852736.0}, 'timestamp': '2025-09-04 04:19:54.673012', 'step': 3802, 'epoch': 3} {'type': 'loss', 'content': 0.0032179898116737604, 'timestamp': '2025-09-04 04:19:54.697760', 'step': 3803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:54.784993', 'step': 3803, 'epoch': 3} {'type': 'loss', 'content': 0.011905638501048088, 'timestamp': '2025-09-04 04:19:54.801510', 'step': 3804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:54.909779', 'step': 3804, 'epoch': 3} {'type': 'loss', 'content': 0.00038323350599966943, 'timestamp': '2025-09-04 04:19:54.932539', 'step': 3805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:19:55.042815', 'step': 3805, 'epoch': 3} {'type': 'loss', 'content': 0.010173635557293892, 'timestamp': '2025-09-04 04:19:55.063495', 'step': 3806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:55.167220', 'step': 3806, 'epoch': 3} {'type': 'loss', 'content': 0.010474266484379768, 'timestamp': '2025-09-04 04:19:55.186635', 'step': 3807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:55.289309', 'step': 3807, 'epoch': 3} {'type': 'loss', 'content': 0.006612957455217838, 'timestamp': '2025-09-04 04:19:55.309016', 'step': 3808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:19:55.385129', 'step': 3808, 'epoch': 3} {'type': 'loss', 'content': 0.005502256099134684, 'timestamp': '2025-09-04 04:19:55.400714', 'step': 3809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:19:55.519741', 'step': 3809, 'epoch': 3} {'type': 'loss', 'content': 0.0005113132647238672, 'timestamp': '2025-09-04 04:19:55.540420', 'step': 3810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:19:55.643172', 'step': 3810, 'epoch': 3} {'type': 'loss', 'content': 0.004534454550594091, 'timestamp': '2025-09-04 04:19:55.662408', 'step': 3811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:55.766556', 'step': 3811, 'epoch': 3} {'type': 'loss', 'content': 0.0004610498435795307, 'timestamp': '2025-09-04 04:19:55.786680', 'step': 3812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:19:55.874100', 'step': 3812, 'epoch': 3} {'type': 'loss', 'content': 0.01628238335251808, 'timestamp': '2025-09-04 04:19:55.892557', 'step': 3813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:55.996550', 'step': 3813, 'epoch': 3} {'type': 'loss', 'content': 0.0012509598163887858, 'timestamp': '2025-09-04 04:19:56.015759', 'step': 3814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:56.116208', 'step': 3814, 'epoch': 3} {'type': 'loss', 'content': 0.002474145032465458, 'timestamp': '2025-09-04 04:19:56.135188', 'step': 3815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:19:56.239734', 'step': 3815, 'epoch': 3} {'type': 'loss', 'content': 0.0014434803742915392, 'timestamp': '2025-09-04 04:19:56.259899', 'step': 3816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 816], 'flops': 16320099139776.0}, 'timestamp': '2025-09-04 04:19:56.379011', 'step': 3816, 'epoch': 3} {'type': 'loss', 'content': 0.0014426189009100199, 'timestamp': '2025-09-04 04:19:56.404527', 'step': 3817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 576], 'flops': 11520070000896.0}, 'timestamp': '2025-09-04 04:19:56.491449', 'step': 3817, 'epoch': 3} {'type': 'loss', 'content': 0.005996841937303543, 'timestamp': '2025-09-04 04:19:56.507188', 'step': 3818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:19:56.606655', 'step': 3818, 'epoch': 3} {'type': 'loss', 'content': 0.0009320307872258127, 'timestamp': '2025-09-04 04:19:56.625375', 'step': 3819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:19:56.737655', 'step': 3819, 'epoch': 3} {'type': 'loss', 'content': 0.028978558257222176, 'timestamp': '2025-09-04 04:19:56.757294', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:05.260868', 'step': 3820, 'epoch': 3} {'type': 'pplx', 'content': 290.2094912937652, 'timestamp': '2025-09-04 04:20:05.263045', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:05.362272', 'step': 3820, 'epoch': 3} {'type': 'loss', 'content': 0.0015448291087523103, 'timestamp': '2025-09-04 04:20:05.383584', 'step': 3821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:20:05.479527', 'step': 3821, 'epoch': 3} {'type': 'loss', 'content': 0.013662063516676426, 'timestamp': '2025-09-04 04:20:05.497092', 'step': 3822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:05.601600', 'step': 3822, 'epoch': 3} {'type': 'loss', 'content': 0.003193710930645466, 'timestamp': '2025-09-04 04:20:05.620767', 'step': 3823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:20:05.722416', 'step': 3823, 'epoch': 3} {'type': 'loss', 'content': 0.026605522260069847, 'timestamp': '2025-09-04 04:20:05.741885', 'step': 3824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:20:05.840824', 'step': 3824, 'epoch': 3} {'type': 'loss', 'content': 0.0024066600017249584, 'timestamp': '2025-09-04 04:20:05.861635', 'step': 3825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:05.965300', 'step': 3825, 'epoch': 3} {'type': 'loss', 'content': 0.0002010101597988978, 'timestamp': '2025-09-04 04:20:05.984532', 'step': 3826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:20:06.093413', 'step': 3826, 'epoch': 3} {'type': 'loss', 'content': 0.01071922481060028, 'timestamp': '2025-09-04 04:20:06.113643', 'step': 3827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:20:06.231680', 'step': 3827, 'epoch': 3} {'type': 'loss', 'content': 0.00014394304889719933, 'timestamp': '2025-09-04 04:20:06.254710', 'step': 3828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:20:06.346407', 'step': 3828, 'epoch': 3} {'type': 'loss', 'content': 0.001017340342514217, 'timestamp': '2025-09-04 04:20:06.365595', 'step': 3829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 512], 'flops': 10240062230528.0}, 'timestamp': '2025-09-04 04:20:06.443920', 'step': 3829, 'epoch': 3} {'type': 'loss', 'content': 0.00646333210170269, 'timestamp': '2025-09-04 04:20:06.458192', 'step': 3830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:20:06.564966', 'step': 3830, 'epoch': 3} {'type': 'loss', 'content': 0.002317563397809863, 'timestamp': '2025-09-04 04:20:06.585040', 'step': 3831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:06.688740', 'step': 3831, 'epoch': 3} {'type': 'loss', 'content': 0.025038516148924828, 'timestamp': '2025-09-04 04:20:06.708726', 'step': 3832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:20:06.790921', 'step': 3832, 'epoch': 3} {'type': 'loss', 'content': 0.0008068301249295473, 'timestamp': '2025-09-04 04:20:06.807722', 'step': 3833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 784], 'flops': 15680095254592.0}, 'timestamp': '2025-09-04 04:20:06.924763', 'step': 3833, 'epoch': 3} {'type': 'loss', 'content': 0.007588067092001438, 'timestamp': '2025-09-04 04:20:06.946951', 'step': 3834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:20:07.023668', 'step': 3834, 'epoch': 3} {'type': 'loss', 'content': 0.0007368748192675412, 'timestamp': '2025-09-04 04:20:07.037543', 'step': 3835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:20:07.144531', 'step': 3835, 'epoch': 3} {'type': 'loss', 'content': 0.00043095918954350054, 'timestamp': '2025-09-04 04:20:07.165248', 'step': 3836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:20:07.273966', 'step': 3836, 'epoch': 3} {'type': 'loss', 'content': 0.004333295859396458, 'timestamp': '2025-09-04 04:20:07.296790', 'step': 3837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:07.394901', 'step': 3837, 'epoch': 3} {'type': 'loss', 'content': 0.0017703445628285408, 'timestamp': '2025-09-04 04:20:07.412129', 'step': 3838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:20:07.495915', 'step': 3838, 'epoch': 3} {'type': 'loss', 'content': 0.03642124682664871, 'timestamp': '2025-09-04 04:20:07.511279', 'step': 3839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:07.604411', 'step': 3839, 'epoch': 3} {'type': 'loss', 'content': 0.015598982572555542, 'timestamp': '2025-09-04 04:20:07.622411', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:16.109142', 'step': 3840, 'epoch': 3} {'type': 'pplx', 'content': 293.44611506915754, 'timestamp': '2025-09-04 04:20:16.111391', 'step': 3840, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3840', 'timestamp': '2025-09-04 04:20:16.624255', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:20:16.730224', 'step': 3840, 'epoch': 3} {'type': 'loss', 'content': 0.03652084618806839, 'timestamp': '2025-09-04 04:20:16.752751', 'step': 3841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:20:16.863261', 'step': 3841, 'epoch': 3} {'type': 'loss', 'content': 0.0017639618599787354, 'timestamp': '2025-09-04 04:20:16.883817', 'step': 3842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:20:16.978707', 'step': 3842, 'epoch': 3} {'type': 'loss', 'content': 0.011736215092241764, 'timestamp': '2025-09-04 04:20:16.996332', 'step': 3843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 432], 'flops': 8640052517568.0}, 'timestamp': '2025-09-04 04:20:17.068336', 'step': 3843, 'epoch': 3} {'type': 'loss', 'content': 0.0006759539246559143, 'timestamp': '2025-09-04 04:20:17.081947', 'step': 3844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:20:17.165333', 'step': 3844, 'epoch': 3} {'type': 'loss', 'content': 0.001056056353263557, 'timestamp': '2025-09-04 04:20:17.181929', 'step': 3845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:20:17.282369', 'step': 3845, 'epoch': 3} {'type': 'loss', 'content': 0.00557122053578496, 'timestamp': '2025-09-04 04:20:17.301087', 'step': 3846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:20:17.402195', 'step': 3846, 'epoch': 3} {'type': 'loss', 'content': 0.010494154877960682, 'timestamp': '2025-09-04 04:20:17.421155', 'step': 3847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:17.525683', 'step': 3847, 'epoch': 3} {'type': 'loss', 'content': 0.0003855594841297716, 'timestamp': '2025-09-04 04:20:17.545741', 'step': 3848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:20:17.643547', 'step': 3848, 'epoch': 3} {'type': 'loss', 'content': 0.0037584335077553988, 'timestamp': '2025-09-04 04:20:17.664083', 'step': 3849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:20:17.772097', 'step': 3849, 'epoch': 3} {'type': 'loss', 'content': 0.008188803680241108, 'timestamp': '2025-09-04 04:20:17.792209', 'step': 3850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:17.898084', 'step': 3850, 'epoch': 3} {'type': 'loss', 'content': 0.0006697289645671844, 'timestamp': '2025-09-04 04:20:17.917502', 'step': 3851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:20:17.995980', 'step': 3851, 'epoch': 3} {'type': 'loss', 'content': 0.025056758895516396, 'timestamp': '2025-09-04 04:20:18.010432', 'step': 3852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:20:18.106741', 'step': 3852, 'epoch': 3} {'type': 'loss', 'content': 0.005154153797775507, 'timestamp': '2025-09-04 04:20:18.125833', 'step': 3853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 720], 'flops': 14400087484224.0}, 'timestamp': '2025-09-04 04:20:18.232402', 'step': 3853, 'epoch': 3} {'type': 'loss', 'content': 0.009047658182680607, 'timestamp': '2025-09-04 04:20:18.252461', 'step': 3854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:20:18.339002', 'step': 3854, 'epoch': 3} {'type': 'loss', 'content': 0.010006698779761791, 'timestamp': '2025-09-04 04:20:18.354353', 'step': 3855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:18.458829', 'step': 3855, 'epoch': 3} {'type': 'loss', 'content': 0.00595315545797348, 'timestamp': '2025-09-04 04:20:18.476860', 'step': 3856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 592], 'flops': 11840071943488.0}, 'timestamp': '2025-09-04 04:20:18.566251', 'step': 3856, 'epoch': 3} {'type': 'loss', 'content': 0.0007929496932774782, 'timestamp': '2025-09-04 04:20:18.584632', 'step': 3857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:20:18.661728', 'step': 3857, 'epoch': 3} {'type': 'loss', 'content': 0.0036627210211008787, 'timestamp': '2025-09-04 04:20:18.675620', 'step': 3858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:18.778203', 'step': 3858, 'epoch': 3} {'type': 'loss', 'content': 0.0008201138116419315, 'timestamp': '2025-09-04 04:20:18.797517', 'step': 3859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:18.900813', 'step': 3859, 'epoch': 3} {'type': 'loss', 'content': 0.0017376311589032412, 'timestamp': '2025-09-04 04:20:18.920853', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:27.435062', 'step': 3860, 'epoch': 3} {'type': 'pplx', 'content': 294.3503277315976, 'timestamp': '2025-09-04 04:20:27.437138', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:20:27.517188', 'step': 3860, 'epoch': 3} {'type': 'loss', 'content': 0.011900283396244049, 'timestamp': '2025-09-04 04:20:27.533953', 'step': 3861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:20:27.642933', 'step': 3861, 'epoch': 3} {'type': 'loss', 'content': 0.015471918508410454, 'timestamp': '2025-09-04 04:20:27.663390', 'step': 3862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:27.766648', 'step': 3862, 'epoch': 3} {'type': 'loss', 'content': 0.001793242641724646, 'timestamp': '2025-09-04 04:20:27.785811', 'step': 3863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:27.879737', 'step': 3863, 'epoch': 3} {'type': 'loss', 'content': 0.00023462541867047548, 'timestamp': '2025-09-04 04:20:27.897669', 'step': 3864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:20:28.002948', 'step': 3864, 'epoch': 3} {'type': 'loss', 'content': 0.005305502098053694, 'timestamp': '2025-09-04 04:20:28.025333', 'step': 3865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:20:28.145124', 'step': 3865, 'epoch': 3} {'type': 'loss', 'content': 0.03417619690299034, 'timestamp': '2025-09-04 04:20:28.165784', 'step': 3866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 1024], 'flops': 20480124393472.0}, 'timestamp': '2025-09-04 04:20:28.313306', 'step': 3866, 'epoch': 3} {'type': 'loss', 'content': 0.007200147025287151, 'timestamp': '2025-09-04 04:20:28.341721', 'step': 3867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:20:28.443445', 'step': 3867, 'epoch': 3} {'type': 'loss', 'content': 0.04463130235671997, 'timestamp': '2025-09-04 04:20:28.470677', 'step': 3868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:28.618239', 'step': 3868, 'epoch': 3} {'type': 'loss', 'content': 0.00033794058253988624, 'timestamp': '2025-09-04 04:20:28.638786', 'step': 3869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 544], 'flops': 10880066115712.0}, 'timestamp': '2025-09-04 04:20:28.741173', 'step': 3869, 'epoch': 3} {'type': 'loss', 'content': 0.09599971026182175, 'timestamp': '2025-09-04 04:20:28.756389', 'step': 3870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:28.860971', 'step': 3870, 'epoch': 3} {'type': 'loss', 'content': 0.00989855919033289, 'timestamp': '2025-09-04 04:20:28.880352', 'step': 3871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 480], 'flops': 9600058345344.0}, 'timestamp': '2025-09-04 04:20:28.959284', 'step': 3871, 'epoch': 3} {'type': 'loss', 'content': 0.020102083683013916, 'timestamp': '2025-09-04 04:20:28.973935', 'step': 3872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:20:29.058460', 'step': 3872, 'epoch': 3} {'type': 'loss', 'content': 0.010750843212008476, 'timestamp': '2025-09-04 04:20:29.075537', 'step': 3873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 736], 'flops': 14720089426816.0}, 'timestamp': '2025-09-04 04:20:29.187735', 'step': 3873, 'epoch': 3} {'type': 'loss', 'content': 0.024111559614539146, 'timestamp': '2025-09-04 04:20:29.208011', 'step': 3874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:20:29.322373', 'step': 3874, 'epoch': 3} {'type': 'loss', 'content': 0.0005178121500648558, 'timestamp': '2025-09-04 04:20:29.342893', 'step': 3875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:29.439124', 'step': 3875, 'epoch': 3} {'type': 'loss', 'content': 0.008165943436324596, 'timestamp': '2025-09-04 04:20:29.457030', 'step': 3876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:20:29.554137', 'step': 3876, 'epoch': 3} {'type': 'loss', 'content': 0.0027514963876456022, 'timestamp': '2025-09-04 04:20:29.573166', 'step': 3877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:29.679149', 'step': 3877, 'epoch': 3} {'type': 'loss', 'content': 0.007004078943282366, 'timestamp': '2025-09-04 04:20:29.698344', 'step': 3878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 464], 'flops': 9280056402752.0}, 'timestamp': '2025-09-04 04:20:29.781681', 'step': 3878, 'epoch': 3} {'type': 'loss', 'content': 0.0011216630227863789, 'timestamp': '2025-09-04 04:20:29.794983', 'step': 3879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:20:29.883567', 'step': 3879, 'epoch': 3} {'type': 'loss', 'content': 0.008931388147175312, 'timestamp': '2025-09-04 04:20:29.899790', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:38.519446', 'step': 3880, 'epoch': 3} {'type': 'pplx', 'content': 287.92952489903456, 'timestamp': '2025-09-04 04:20:38.522894', 'step': 3880, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3880', 'timestamp': '2025-09-04 04:20:39.031240', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 688], 'flops': 13760083599040.0}, 'timestamp': '2025-09-04 04:20:39.130503', 'step': 3880, 'epoch': 3} {'type': 'loss', 'content': 0.0010395454009994864, 'timestamp': '2025-09-04 04:20:39.151468', 'step': 3881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 560], 'flops': 11200068058304.0}, 'timestamp': '2025-09-04 04:20:39.237424', 'step': 3881, 'epoch': 3} {'type': 'loss', 'content': 0.004671165719628334, 'timestamp': '2025-09-04 04:20:39.252861', 'step': 3882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:20:39.348868', 'step': 3882, 'epoch': 3} {'type': 'loss', 'content': 0.004468918778002262, 'timestamp': '2025-09-04 04:20:39.366431', 'step': 3883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:39.461021', 'step': 3883, 'epoch': 3} {'type': 'loss', 'content': 0.00038716281414963305, 'timestamp': '2025-09-04 04:20:39.478989', 'step': 3884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 624], 'flops': 12480075828672.0}, 'timestamp': '2025-09-04 04:20:39.580317', 'step': 3884, 'epoch': 3} {'type': 'loss', 'content': 0.0007503409287892282, 'timestamp': '2025-09-04 04:20:39.599412', 'step': 3885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:20:39.719678', 'step': 3885, 'epoch': 3} {'type': 'loss', 'content': 0.009064269252121449, 'timestamp': '2025-09-04 04:20:39.738284', 'step': 3886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 656], 'flops': 13120079713856.0}, 'timestamp': '2025-09-04 04:20:39.837794', 'step': 3886, 'epoch': 3} {'type': 'loss', 'content': 0.00859552901238203, 'timestamp': '2025-09-04 04:20:39.856340', 'step': 3887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 640], 'flops': 12800077771264.0}, 'timestamp': '2025-09-04 04:20:39.952242', 'step': 3887, 'epoch': 3} {'type': 'loss', 'content': 0.007069156970828772, 'timestamp': '2025-09-04 04:20:39.970460', 'step': 3888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:20:40.080145', 'step': 3888, 'epoch': 3} {'type': 'loss', 'content': 0.003085685195401311, 'timestamp': '2025-09-04 04:20:40.102925', 'step': 3889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 528], 'flops': 10560064173120.0}, 'timestamp': '2025-09-04 04:20:40.188536', 'step': 3889, 'epoch': 3} {'type': 'loss', 'content': 0.013397028669714928, 'timestamp': '2025-09-04 04:20:40.203557', 'step': 3890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 704], 'flops': 14080085541632.0}, 'timestamp': '2025-09-04 04:20:40.308276', 'step': 3890, 'epoch': 3} {'type': 'loss', 'content': 0.0029367466922849417, 'timestamp': '2025-09-04 04:20:40.327712', 'step': 3891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:20:40.430752', 'step': 3891, 'epoch': 3} {'type': 'loss', 'content': 0.0016781740123406053, 'timestamp': '2025-09-04 04:20:40.450543', 'step': 3892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:40.545165', 'step': 3892, 'epoch': 3} {'type': 'loss', 'content': 0.0011187720810994506, 'timestamp': '2025-09-04 04:20:40.563906', 'step': 3893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 768], 'flops': 15360093312000.0}, 'timestamp': '2025-09-04 04:20:40.673482', 'step': 3893, 'epoch': 3} {'type': 'loss', 'content': 0.007320099975913763, 'timestamp': '2025-09-04 04:20:40.694144', 'step': 3894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 672], 'flops': 13440081656448.0}, 'timestamp': '2025-09-04 04:20:40.805033', 'step': 3894, 'epoch': 3} {'type': 'loss', 'content': 0.0027296245098114014, 'timestamp': '2025-09-04 04:20:40.824078', 'step': 3895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 912], 'flops': 18240110795328.0}, 'timestamp': '2025-09-04 04:20:40.958781', 'step': 3895, 'epoch': 3} {'type': 'loss', 'content': 0.0014986982569098473, 'timestamp': '2025-09-04 04:20:40.984293', 'step': 3896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:41.076680', 'step': 3896, 'epoch': 3} {'type': 'loss', 'content': 0.011014275252819061, 'timestamp': '2025-09-04 04:20:41.095522', 'step': 3897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 496], 'flops': 9920060287936.0}, 'timestamp': '2025-09-04 04:20:41.174621', 'step': 3897, 'epoch': 3} {'type': 'loss', 'content': 0.0021178617607802153, 'timestamp': '2025-09-04 04:20:41.188639', 'step': 3898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 608], 'flops': 12160073886080.0}, 'timestamp': '2025-09-04 04:20:41.283021', 'step': 3898, 'epoch': 3} {'type': 'loss', 'content': 0.03507016599178314, 'timestamp': '2025-09-04 04:20:41.300244', 'step': 3899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 752], 'flops': 15040091369408.0}, 'timestamp': '2025-09-04 04:20:41.413224', 'step': 3899, 'epoch': 3} {'type': 'loss', 'content': 0.0063927737064659595, 'timestamp': '2025-09-04 04:20:41.434668', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:49.925006', 'step': 3900, 'epoch': 3} {'type': 'pplx', 'content': 280.9776699653499, 'timestamp': '2025-09-04 04:20:49.927474', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1200], 'batch_size': 8, 'flops': 23953716633984}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 1424], 'batch_size': 8, 'flops': 28425077059712}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 528], 'batch_size': 8, 'flops': 10539635356800}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 544], 'batch_size': 8, 'flops': 10859018244352}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 880], 'batch_size': 8, 'flops': 17566058882944}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 784], 'batch_size': 8, 'flops': 15649761557632}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 736], 'batch_size': 8, 'flops': 14691612894976}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 592], 'batch_size': 8, 'flops': 11817166907008}, {'type': 'perplexity', 'in_batch_dim': [8, 576], 'batch_size': 8, 'flops': 11497784019456}, {'type': 'perplexity', 'in_batch_dim': [8, 608], 'batch_size': 8, 'flops': 12136549794560}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 704], 'batch_size': 8, 'flops': 14052847119872}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 1344], 'batch_size': 8, 'flops': 26828162621952}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 624], 'batch_size': 8, 'flops': 12455932682112}, {'type': 'perplexity', 'in_batch_dim': [8, 688], 'batch_size': 8, 'flops': 13733464232320}, {'type': 'perplexity', 'in_batch_dim': [8, 1376], 'batch_size': 8, 'flops': 27466928397056}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 720], 'batch_size': 8, 'flops': 14372230007424}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 672], 'batch_size': 8, 'flops': 13414081344768}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 560], 'batch_size': 8, 'flops': 11178401131904}, {'type': 'perplexity', 'in_batch_dim': [8, 640], 'batch_size': 8, 'flops': 12775315569664}, {'type': 'perplexity', 'in_batch_dim': [8, 656], 'batch_size': 8, 'flops': 13094698457216}, {'type': 'perplexity', 'in_batch_dim': [2, 384], 'batch_size': 8, 'flops': 7665189368832}], 'timestamp': '2025-09-04 04:20:58.637186', 'step': 3900, 'epoch': 3} {'type': 'pplx', 'content': 280.9776699653499, 'timestamp': '2025-09-04 04:20:58.649417', 'step': 3900, 'epoch': 3} {'type': 'best_pplx', 'content': 255.24736725474492, 'timestamp': '2025-09-04 04:20:58.657551', 'step': 3900, 'epoch': 3} {'type': 'best_step', 'content': 3640, 'timestamp': '2025-09-04 04:20:58.671847', 'step': 3900, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 224996302651284480, 'timestamp': '2025-09-04 04:20:58.684121', 'step': 3900, 'epoch': 3} {'type': 'total_train_flops', 'content': 5.201887604744794e+16, 'timestamp': '2025-09-04 04:20:59.236260', 'step': 3900, 'epoch': 3}